diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..eed972c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+llm/fed-gen/
+llm/src-gen/
+llm/include/
+llm/bin
+**__pycache__**
+llm/=**
\ No newline at end of file
diff --git a/llm/README.md b/llm/README.md
new file mode 100644
index 0000000..c97f213
--- /dev/null
+++ b/llm/README.md
@@ -0,0 +1,135 @@
+
+# LLM Demo Overview
+This is a quiz-style game between two LLM agents. For each question the user types at the keyboard, the Judge sends the query to both agents, which answer in parallel. The Judge announces whichever answer arrives first (or a timeout if neither responds within 60 seconds) and prints the elapsed logical and physical time for each question.
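+The actual coordination is implemented with Lingua Franca reactors (see `src/llm_base_class.lf`); purely as a conceptual illustration of the pattern ("first answer wins, with a timeout"), a minimal Python sketch:
+```
+# Conceptual sketch of the Judge's "first answer wins" pattern (not the actual LF implementation).
+from concurrent.futures import ThreadPoolExecutor, FIRST_COMPLETED, wait
+
+def judge(question, agent_a, agent_b, timeout_s=60):
+    with ThreadPoolExecutor(max_workers=2) as pool:
+        futures = {pool.submit(agent_a, question): "LLM-A",
+                   pool.submit(agent_b, question): "LLM-B"}
+        done, _ = wait(futures, timeout=timeout_s, return_when=FIRST_COMPLETED)
+        if not done:
+            return "TIMEOUT (60 s)"
+        first = next(iter(done))
+        return f"Winner: {futures[first]} | Answer: {first.result()}"
+```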
+
+# Directory Structure
+- [federated](src/federated/) - Federated versions of the LLM demo.
+- [agents](src/agents/) - Python implementations of the LLM agents.
+
+# Pre-requisites
+
+You need Python >= 3.10 installed.
+
+## Library Dependencies
+The dependencies required to run this project are listed in the [requirements.txt](requirements.txt) file. The models used in this repository are quantized to 4-bit precision (bnb_4bit) and rely on bitsandbytes for efficient matrix operations and memory optimization, so compatible versions of bitsandbytes, torch, and torchvision are required.
+While newer versions of the other dependencies may work, the versions listed in [requirements.txt](requirements.txt) have been tested and are recommended for optimal performance.
+It is highly recommended to create a Python virtual environment or a Conda environment to manage dependencies. \
+To create a virtual environment, follow the steps below.
+
+### Step 1: Creating environment
+```
+python3 -m venv llm
+source llm/bin/activate
+```
+To activate the environment again later, run `source llm/bin/activate`.
+Alternatively, with Conda:
+```
+conda create -n llm
+conda activate llm
+```
+### Step 2: Installing the required packages
+Check if pip is installed:
+```
+pip --version
+```
+If it is not installed:
+```
+python -m pip install --upgrade pip
+```
+Run this command to install the packages listed in the [requirements.txt](requirements.txt) file: \
+**Note**: Since this demo uses LLMs with 7B and 70B parameters, a device with GPU support is recommended.
+```
+pip install -r requirements.txt
+```
+To check if all the requirements are installed, run:
+```
+pip list | grep -E "transformers|accelerate|tokenizers|bitsandbytes"
+```
+To install torch:
+
+1. For devices without a GPU:
+```
+pip install torch torchvision
+```
+2. For devices with a GPU:
+   Check the CUDA version by running:
+ ```
+ nvidia-smi
+ ```
+   Look for the "CUDA Version" field in the output, as shown in the image below:
+
+   ![CUDA version reported by nvidia-smi](img/cudaversion.png)
+
+   With the correct CUDA version identified, install PyTorch from [PyTorch](https://pytorch.org/get-started/locally/) by selecting the matching OS and compute platform, as shown in the image below for a Linux system with CUDA version 12.8:
+
+   ![PyTorch installation selector](img/pytorch.png)
+
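+After installing torch, you can confirm that the build sees the GPU and reports the expected CUDA version; a quick check:
+```
+# Quick check that the installed PyTorch build can use the GPU.
+import torch
+
+print("torch:", torch.__version__)
+print("CUDA available:", torch.cuda.is_available())
+if torch.cuda.is_available():
+    print("CUDA build:", torch.version.cuda)          # e.g. 12.8
+    print("GPU:", torch.cuda.get_device_name(0))      # e.g. NVIDIA RTX A6000
+```
+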
+### Step 3: Model Dependencies
+- **Pre-trained models used in agents/llm.py**: [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) \
+**Note:** Follow the steps below to obtain an access token for the Hugging Face models.
+1. Create a user access token by following the official documentation: [User access tokens](https://huggingface.co/docs/hub/en/security-tokens)
+2. Log in with the Hugging Face CLI by running `huggingface-cli login`. Refer to the official documentation for step-by-step instructions: [HuggingFace CLI](https://huggingface.co/docs/huggingface_hub/en/guides/cli). A minimal programmatic check is sketched after this list.
+3. The Llama models are gated; if you are using them for the first time, request access through these links: [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)
+
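+To confirm that authentication is in place before the gated Llama models are downloaded, a minimal sketch (the `HF_TOKEN` environment variable name is only an example; any valid user access token works):
+```
+# Sketch: verify Hugging Face authentication before loading the gated Llama models.
+# Assumes a token is either cached by `huggingface-cli login` or provided via HF_TOKEN.
+import os
+from huggingface_hub import login, whoami
+
+token = os.environ.get("HF_TOKEN")
+if token:
+    login(token=token)  # programmatic equivalent of `huggingface-cli login`
+
+try:
+    print("Authenticated as:", whoami()["name"])  # raises if no valid token is found
+except Exception as err:
+    print("Not authenticated with Hugging Face:", err)
+```
+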
+## System Requirements
+
+The following hardware and software setup was used for this demo. \
+**Note:** To replicate the demo, any equivalent hardware that meets the computational requirements can be used.
+
+### Hardware Requirements
+The demo was tested with the following hardware setup.
+- **GPU**: NVIDIA RTX A6000
+
+### Software Requirements
+- **OS**: Linux
+- **Python**: >= 3.10
+- **CUDA Version**: 12.8
+
+Make sure the environment is properly configured to use CUDA for optimal GPU acceleration.
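+When CUDA is available, the agents load the models with 4-bit quantization via bitsandbytes (see `src/agents/llm.py`); the configuration is roughly the following sketch:
+```
+# Sketch of the 4-bit loading configuration used by the agents (see src/agents/llm.py).
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+model_id = "meta-llama/Llama-2-7b-chat-hf"
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,                      # store weights in 4-bit (nf4) form
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,  # compute in bfloat16
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    quantization_config=bnb_config,
+    device_map="auto",
+    low_cpu_mem_usage=True,
+)
+```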
+
+# Files and directories in this repository
+ - **`llm_base_class.lf`** - Contains the base reactors LlmA, LlmB, and Judge.
+ - **`llm_quiz_game.lf`** - Lingua Franca program that defines the quiz game reactors (LLM agent A, LLM agent B and Judge).
+
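+Before running the Lingua Franca program, you can optionally sanity-check the agents directly from Python; a minimal sketch, assuming `llm/` is the working directory and the model-access steps above are complete (importing `llm.py` loads both Llama models, so a GPU is required):
+```
+# Sketch: call the two agents defined in src/agents/llm.py directly.
+import sys, time
+
+sys.path.insert(0, "src/agents")   # so `llm` resolves to src/agents/llm.py
+from llm import agent1, agent2     # loads both Llama models on import
+
+question = "What is the capital of South Korea?"
+for name, agent in (("LLM-A", agent1), ("LLM-B", agent2)):
+    start = time.time()
+    print(f"{name}: {agent(question)}  ({(time.time() - start) * 1000:.0f} ms)")
+```
+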
+# Execution Workflow
+
+### Step 1: Compile the Lingua Franca program
+Compile **`llm_quiz_game.lf`** with the Lingua Franca compiler (`lfc`).
+
+**Note:**
+- Ensure that you specify the correct file paths
+
+Run the following command:
+
+```
+lfc src/llm_quiz_game.lf
+```
+
+### Step 2: Run the binary file and input the quiz question
+Run the following command:
+
+```
+./bin/llm_quiz_game
+```
+
+The program will prompt you to enter a quiz question from the keyboard.
+
+Example output printed on the terminal:
+
+
+ +-------------------------------------------------- +---- System clock resolution: 1 nsec +---- Start execution on Fri Sep 19 10:46:31 2025 ---- plus 772215861 nanoseconds +Enter the quiz question +What is the capital of South Korea? +Query: What is the capital of South Korea? + +waiting... + +Winner: LLM-B | logical 1184 ms | physical 1184 ms +Answer: Seoul. +-------------------------------------------------- + ++ +# Contributors +- Deeksha Prahlad (dprahlad@asu.edu), Ph.D. student at Arizona State University +- Hokeun Kim (hokeun@asu.edu, https://hokeun.github.io/), Assistant professor at Arizona State University diff --git a/llm/img/cudaversion.png b/llm/img/cudaversion.png new file mode 100644 index 0000000..2b7e874 Binary files /dev/null and b/llm/img/cudaversion.png differ diff --git a/llm/img/pytorch.png b/llm/img/pytorch.png new file mode 100644 index 0000000..3ecd8af Binary files /dev/null and b/llm/img/pytorch.png differ diff --git a/llm/requirements.txt b/llm/requirements.txt new file mode 100644 index 0000000..c8a18f7 --- /dev/null +++ b/llm/requirements.txt @@ -0,0 +1,5 @@ +accelerate +transformers +tokenizers +bitsandbytes>=0.43.0 + diff --git a/llm/src/agents/llm.py b/llm/src/agents/llm.py new file mode 100644 index 0000000..1d88658 --- /dev/null +++ b/llm/src/agents/llm.py @@ -0,0 +1,89 @@ +### Import Libraries +import transformers +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from torch import cuda, bfloat16 + + +### Model to be chosen to act as an agent +model_id = "meta-llama/Llama-2-7b-chat-hf" +model_id_2 = "meta-llama/Llama-2-70b-chat-hf" + +### To check if there is GPU and convert it into float 16 +has_cuda = torch.cuda.is_available() +dtype = torch.bfloat16 if has_cuda else torch.float32 + +### To convert the model into 4bit quantization +bnb_config = None +### if there is cuda then the model is converted to 4bit quantization +if has_cuda: + try: + import bitsandbytes as bnb + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=dtype, + ) + except Exception: + bnb_config = None + +### calling pre-trained tokenizer +tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) +tokenizer_2 = AutoTokenizer.from_pretrained(model_id_2, use_fast=True) +for tok in (tokenizer, tokenizer_2): + if tok.pad_token_id is None: + tok.pad_token = tok.eos_token + +### since both the models have same device map and using 4bit quantization for both +common = dict( + device_map="auto" if has_cuda else None, + torch_dtype=dtype, # Changed from dtype=dtype (correct arg name) + low_cpu_mem_usage=True, +) +if bnb_config is not None: + common["quantization_config"] = bnb_config + +### calling pre-trained model +model = AutoModelForCausalLM.from_pretrained(model_id, **common) +model_2 = AutoModelForCausalLM.from_pretrained(model_id_2, **common) +model.eval(); model_2.eval() + + +### arguments for both the models +GEN_A = dict(max_new_tokens=24, do_sample=False, temperature=0.1, + eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id) +GEN_B = dict(max_new_tokens=24, do_sample=False, temperature=0.1, + eos_token_id=tokenizer_2.eos_token_id, pad_token_id=tokenizer_2.pad_token_id) + +###to resturn only one line answers +def postprocess(text: str) -> str: + t = text.strip() + for sep in ["\n", ". 
", " "]: + idx = t.find(sep) + if idx > 0: + t = t[:idx] + break + return t.strip().strip(":").strip() + +###Calling agent1 from .lf code +def agent1(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer(prompt, return_tensors="pt") + if has_cuda: inputs = {k: v.to("cuda") for k, v in inputs.items()} + with torch.no_grad(): + out = model.generate(**inputs, **GEN_A) + prompt_len = inputs["input_ids"].shape[1] + result = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True) + return postprocess(result) + +###Calling agent2 from .lf code +def agent2(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer_2(prompt, return_tensors="pt") + if has_cuda: inputs = {k: v.to("cuda") for k, v in inputs.items()} + with torch.no_grad(): + out = model_2.generate(**inputs, **GEN_B) + prompt_len = inputs["input_ids"].shape[1] + result = tokenizer_2.decode(out[0][prompt_len:], skip_special_tokens=True) + return postprocess(result) \ No newline at end of file diff --git a/llm/src/agents/llm_a.py b/llm/src/agents/llm_a.py new file mode 100644 index 0000000..0e888bc --- /dev/null +++ b/llm/src/agents/llm_a.py @@ -0,0 +1,78 @@ +# llm_a.py + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +#Model +model_id = "meta-llama/Llama-2-7b-chat-hf" + + +has_cuda = torch.cuda.is_available() +if not has_cuda: + raise RuntimeError("CUDA GPU required for this configuration.") +dtype = torch.bfloat16 if has_cuda else torch.float32 + +#4-bit quantization +bnb_config = None +if has_cuda: + try: + import bitsandbytes as bnb + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=dtype, + ) + except Exception: + bnb_config = None + +#Tokenizer and the token is automatically used if logged in via CLI +tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) +if tokenizer.pad_token_id is None: + tokenizer.pad_token = tokenizer.eos_token + + +common = dict( + device_map="auto" if has_cuda else None, + torch_dtype=dtype, + low_cpu_mem_usage=True, +) + +if bnb_config is not None: + common["quantization_config"] = bnb_config + +#model +model = AutoModelForCausalLM.from_pretrained(model_id, **common) +model.eval() + +#Generation +GEN_A = dict( + max_new_tokens=24, + do_sample=False, + temperature=0.1, + eos_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.pad_token_id +) + +#post-processing +def postprocess(text: str) -> str: + t = text.strip() + for sep in ["\n", ". 
", " "]: + idx = t.find(sep) + if idx > 0: + t = t[:idx] + break + return t.strip().strip(":").strip() + +#Agent 1 +def agent1(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer(prompt, return_tensors="pt") + if has_cuda: + inputs = {k: v.to("cuda") for k, v in inputs.items()} + with torch.no_grad(): + out = model.generate(**inputs, **GEN_A) + prompt_len = inputs["input_ids"].shape[1] + result = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True) + print(result) + return postprocess(result) \ No newline at end of file diff --git a/llm/src/agents/llm_b.py b/llm/src/agents/llm_b.py new file mode 100644 index 0000000..621d43d --- /dev/null +++ b/llm/src/agents/llm_b.py @@ -0,0 +1,81 @@ +# llm_b.py + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +#Model +model_id_2 = "meta-llama/Llama-2-70b-chat-hf" + +#Requires the GPU for this model +has_cuda = torch.cuda.is_available() +if not has_cuda: + raise RuntimeError("CUDA GPU required for this configuration.") +dtype = torch.bfloat16 if has_cuda else torch.float32 + +#4-bit quantization +bnb_config = None +if has_cuda: + try: + import bitsandbytes as bnb + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=dtype, + ) + except Exception: + bnb_config = None + +#Tokenizer and the token automatically used if logged in via CLI +tokenizer_2 = AutoTokenizer.from_pretrained(model_id_2, use_fast=True) +if tokenizer_2.pad_token_id is None: + tokenizer_2.pad_token = tokenizer_2.eos_token + + +common = dict( + device_map="auto" if has_cuda else None, + torch_dtype=dtype, + low_cpu_mem_usage=True, +) + +if bnb_config is not None: + common["quantization_config"] = bnb_config + +#Model +model_2 = AutoModelForCausalLM.from_pretrained(model_id_2, **common) +model_2.eval() + +#Generation +GEN_B = dict( + max_new_tokens=24, + do_sample=False, + temperature=0.1, + eos_token_id=tokenizer_2.eos_token_id, + pad_token_id=tokenizer_2.pad_token_id, +) + +#Post-processing +def postprocess(text: str) -> str: + t = text.strip() + for sep in ["\n", ". 
", " "]: + idx = t.find(sep) + if idx > 0: + t = t[:idx] + break + return t.strip().strip(":").strip() + +#Agent 2 +def agent2(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer_2(prompt, return_tensors="pt") + + if has_cuda: + inputs = {k: v.to("cuda") for k, v in inputs.items()} + + with torch.no_grad(): + out = model_2.generate(**inputs, **GEN_B) + + prompt_len = inputs["input_ids"].shape[1] + result = tokenizer_2.decode(out[0][prompt_len:], skip_special_tokens=True) + print(result) + return postprocess(result) \ No newline at end of file diff --git a/llm/src/agents/llm_b_jetson.py b/llm/src/agents/llm_b_jetson.py new file mode 100644 index 0000000..b57e157 --- /dev/null +++ b/llm/src/agents/llm_b_jetson.py @@ -0,0 +1,47 @@ +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + + +# Model ID +model_id = "meta-llama/Llama-3.2-1B" + +# Check GPU availability +has_cuda = torch.cuda.is_available() +device = torch.device("cuda" if has_cuda else "cpu") +compute_dtype = torch.float16 if has_cuda else torch.float32 + + +common = dict( + low_cpu_mem_usage=True, + attn_implementation="eager", +) + +#Load tokenizer and the token automatically used from CLI login +tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) +if tokenizer.pad_token_id is None: + tokenizer.pad_token = tokenizer.eos_token + +#Load model +mp_kwargs = dict(torch_dtype=compute_dtype, **common) +model = AutoModelForCausalLM.from_pretrained(model_id, **mp_kwargs) +model.to(device) +model.eval() + +#Generation +GEN = dict( + max_new_tokens=64, + do_sample=True, + temperature=0.7, + top_p=0.95, + eos_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.pad_token_id, +) + +#Agent 2 +def agent2(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer(prompt, return_tensors="pt").to(device) + with torch.inference_mode(): + out = model.generate(**inputs, **GEN) + gen = out[0, inputs["input_ids"].shape[1]:] + return tokenizer.decode(gen, skip_special_tokens=True).strip() diff --git a/llm/src/agents/llm_b_m2.py b/llm/src/agents/llm_b_m2.py new file mode 100644 index 0000000..adf4e15 --- /dev/null +++ b/llm/src/agents/llm_b_m2.py @@ -0,0 +1,92 @@ +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +#Model +model_id_2 = "google/gemma-3-270m" + +#Device setup +has_cuda = torch.cuda.is_available() +has_mps = torch.backends.mps.is_available() + +if has_cuda: + device = torch.device("cuda") + compute_dtype = torch.float16 +elif has_mps: + device = torch.device("mps") + compute_dtype = torch.float32 +else: + device = torch.device("cpu") + compute_dtype = torch.float32 + +#Common model kwargs +common = dict( + low_cpu_mem_usage=True, + attn_implementation="eager" +) + +#4-bit quantization on CUDA if available +if has_cuda: + try: + import bitsandbytes as bnb + common["quantization_config"] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=compute_dtype, + ) + common["device_map"] = "auto" + except Exception: + print("[WARN] bitsandbytes not available; using full-precision fp16 on CUDA.", flush=True) + common["device_map"] = "auto" +else: + common["device_map"] = None + +#Tokenizer and the token automatically used if logged in via CLI +tokenizer_2 = AutoTokenizer.from_pretrained(model_id_2, use_fast=True) +if tokenizer_2.pad_token_id is None: + tokenizer_2.pad_token = tokenizer_2.eos_token + +# Model 
+mp_kwargs = dict(dtype=compute_dtype, **common) +model_2 = AutoModelForCausalLM.from_pretrained(model_id_2, **mp_kwargs) + + +if not has_cuda: + model_2.to(device) +model_2.eval() + +# Generation +GEN_B = dict( + max_new_tokens=32, + do_sample=True, + eos_token_id=tokenizer_2.eos_token_id, + pad_token_id=tokenizer_2.pad_token_id, +) + +def postprocess(text: str) -> str: + t = text.strip() + for sep in ["\n", ". ", " "]: + i = t.find(sep) + if i > 0: + t = t[:i] + break + return t.strip().strip(":").strip() + +def agent2(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer_2(prompt, return_tensors="pt") + + if has_cuda: + inputs = {k: v.to("cuda") for k, v in inputs.items()} + elif has_mps: + inputs = {k: v.to("mps") for k, v in inputs.items()} + else: + inputs = {k: v.to("cpu") for k, v in inputs.items()} + + with torch.inference_mode(): + out = model_2.generate(**inputs, **GEN_B) + + prompt_len = inputs["input_ids"].shape[1] + result = tokenizer_2.decode(out[0][prompt_len:], skip_special_tokens=True) + print(result) + return postprocess(result) \ No newline at end of file diff --git a/llm/src/agents/llm_small.py b/llm/src/agents/llm_small.py new file mode 100644 index 0000000..a02de1d --- /dev/null +++ b/llm/src/agents/llm_small.py @@ -0,0 +1,89 @@ +### Import Libraries +import transformers +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from torch import cuda, bfloat16 + + +### Model to be chosen to act as an agent +model_id = "microsoft/Phi-3.5-mini-instruct" +model_id_2 = "EleutherAI/pythia-70m" + +### To check if there is GPU and convert it into float 16 +has_cuda = torch.cuda.is_available() +dtype = torch.bfloat16 if has_cuda else torch.float32 + +### To convert the model into 4bit quantization +bnb_config = None +### if there is cuda then the model is converted to 4bit quantization +if has_cuda: + try: + import bitsandbytes as bnb + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=dtype, + ) + except Exception: + bnb_config = None + +### calling pre-trained tokenizer +tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) +tokenizer_2 = AutoTokenizer.from_pretrained(model_id_2, use_fast=True) +for tok in (tokenizer, tokenizer_2): + if tok.pad_token_id is None: + tok.pad_token = tok.eos_token + +### since both the models have same device map and using 4bit quantization for both +common = dict( + device_map="auto" if has_cuda else None, + dtype=dtype, + low_cpu_mem_usage=True, +) +if bnb_config is not None: + common["quantization_config"] = bnb_config + +### calling pre-trained model +model = AutoModelForCausalLM.from_pretrained(model_id, **common) +model_2 = AutoModelForCausalLM.from_pretrained(model_id_2, **common) +model.eval(); model_2.eval() + + +### arguments for both the models +GEN_A = dict(max_new_tokens=24, do_sample=False, temperature=0.1, + eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id) +GEN_B = dict(max_new_tokens=24, do_sample=False, temperature=0.1, + eos_token_id=tokenizer_2.eos_token_id, pad_token_id=tokenizer_2.pad_token_id) + +###to resturn only one line answers +def postprocess(text: str) -> str: + t = text.strip() + for sep in ["\n", ". 
", " "]: + idx = t.find(sep) + if idx > 0: + t = t[:idx] + break + return t.strip().strip(":").strip() + +###Calling agent1 from .lf code +def agent1(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer(prompt, return_tensors="pt") + if has_cuda: inputs = {k: v.to("cuda") for k, v in inputs.items()} + with torch.no_grad(): + out = model.generate(**inputs, **GEN_A) + prompt_len = inputs["input_ids"].shape[1] + result = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True) + return postprocess(result) + +###Calling agent2 from .lf code +def agent2(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer_2(prompt, return_tensors="pt") + if has_cuda: inputs = {k: v.to("cuda") for k, v in inputs.items()} + with torch.no_grad(): + out = model_2.generate(**inputs, **GEN_B) + prompt_len = inputs["input_ids"].shape[1] + result = tokenizer_2.decode(out[0][prompt_len:], skip_special_tokens=True) + return postprocess(result) \ No newline at end of file diff --git a/llm/src/federated/README.md b/llm/src/federated/README.md new file mode 100644 index 0000000..ef8970b --- /dev/null +++ b/llm/src/federated/README.md @@ -0,0 +1,154 @@ +# LLM Demo (Federated Execution) Overview + +This is a quiz-style game between two LLM agents using federated execution. For each user question asked to the Judge, both agents answer in parallel. The Judge announces whichever answer arrives first (or a timeout if neither responds within 60 sec), and prints per-question elapsed logical and physical times. There are three federates (federate__llma, federate__llmb, federate__j) and an RTI. + +# Pre-requisites + +You need Python >= 3.10 installed. + +## Library Dependencies +To run this project, there are dependencies required which are in [requirements.txt](requirements.txt) file. The model used in this repository has been quantized using 4-bit precision (bnb_4bit) and relies on bitsandbytes for efficient matrix operations and memory optimization. So specific versions of bitsandbytes, torch, and torchvision are mandatory for compatibility. +While newer versions of other dependencies may work, the specific versions listed below have been tested and are recommended for optimal performance. +It is highly recommended to create a Python virtual environment or a Conda environment to manage dependencies. \ +To create the a virtual environment follow the steps below. + +### Step 1: Creating environment +Replace this <> with the environment name +``` +python3 -m venv
+
+ With the correct CUDA version identified, install PyTorch from [PyTorch](https://pytorch.org/get-started/locally/) by selecting the matching OS and compute platform, as shown in the image below for a Linux system with CUDA version 12.8:
+
+ ![PyTorch installation selector](../../img/pytorch.png)
+
+### Step 3: Model Dependencies
+- **Pre-trained models used in agents/llm_a.py and agents/llm_b.py**: [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) \
+**Note:** Follow the steps below to obtain an access token for the Hugging Face models.
+1. Create a user access token by following the official documentation: [User access tokens](https://huggingface.co/docs/hub/en/security-tokens)
+2. Log in with the Hugging Face CLI by running `huggingface-cli login`. Refer to the official documentation for step-by-step instructions: [HuggingFace CLI](https://huggingface.co/docs/huggingface_hub/en/guides/cli)
+3. The Llama models are gated; if you are using them for the first time, request access through these links: [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)
+
+## System Requirements
+
+The following hardware and software setup was used for this demo. \
+**Note:** To replicate the demo, any equivalent hardware that meets the computational requirements can be used.
+
+### Hardware Requirements
+The demo was tested with the following hardware setup.
+- **GPU**: NVIDIA RTX A6000
+
+### Software Requirements
+- **OS**: Linux
+- **Python**: >= 3.10
+- **CUDA Version**: 12.8
+
+Make sure the environment is properly configured to use CUDA for optimal GPU acceleration.
+
+# Files and directories in this repository
+ - **`llm_base_class_federate.lf`** - Contains the base reactors LlmA, LlmB and Judge.
+ - **`llm_game_federated.lf`** - Lingua Franca program that defines the quiz game as federated execution.
+
+# Execution Workflow
+
+### Step 1: Compile the federated Lingua Franca program
+Before compiling, specify the RTI host by setting its IP address in the federated reactor declaration:
+```
+federated reactor llm_game_federated at 10.xxx.xxx.xx {
+}
+```
+
+Compile **`llm_game_federated.lf`** with the Lingua Franca compiler (`lfc`).
+
+**Note:**
+- Ensure that you specify the correct file paths
+
+Run the following command:
+
+```
+lfc src/federated/llm_game_federated.lf
+```
+
+### Step 2: Run the binary file and input the quiz question
+Run the following command:
+
+```
+cd fed-gen/llm_game_federated/
+```
+
+In the first terminal run:
+```
+./bin/RTI -n 3
+
+```
+In the second terminal run:
+```
+./bin/federate__j
+
+```
+In the third terminal run:
+```
+./bin/federate__llma
+
+```
+In the fourth terminal run:
+```
+./bin/federate__llmb
+
+```
+
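+If you prefer to launch everything locally from a single terminal (for example, while testing on one machine), a convenience sketch; the binary names match the ones above, and the Judge federate stays in the foreground because it reads questions from stdin:
+```
+# Sketch: start the RTI and all three federates from one terminal.
+# Assumes the working directory is fed-gen/llm_game_federated/ (binaries under ./bin).
+import subprocess, time
+
+background = [subprocess.Popen(["./bin/RTI", "-n", "3"])]
+time.sleep(1)  # give the RTI a moment to start listening
+for fed in ("federate__llma", "federate__llmb"):
+    background.append(subprocess.Popen([f"./bin/{fed}"]))
+
+subprocess.run(["./bin/federate__j"])  # the Judge reads quiz questions from stdin
+
+for p in background:  # stop the RTI and agent federates when the Judge exits
+    p.terminate()
+```
+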
+The program will prompt you to enter a quiz question from the keyboard.
+
+Example output printed on the terminal:
+
++ +-------------------------------------------------- +---- System clock resolution: 1 nsec +---- Start execution on Fri Sep 19 10:46:31 2025 ---- plus 772215861 nanoseconds +Enter the quiz question +What is the capital of South Korea? +Query: What is the capital of South Korea? + +waiting... + +Winner: LLM-B | logical 1184 ms | physical 1184 ms +Answer: Seoul. +-------------------------------------------------- + ++ +# Contributors +- Deeksha Prahlad (dprahlad@asu.edu), Ph.D. student at Arizona State University +- Hokeun Kim (hokeun@asu.edu, https://hokeun.github.io/), Assistant professor at Arizona State University + diff --git a/llm/src/federated/llm_base_class_federate.lf b/llm/src/federated/llm_base_class_federate.lf new file mode 100644 index 0000000..57171ed --- /dev/null +++ b/llm/src/federated/llm_base_class_federate.lf @@ -0,0 +1,244 @@ +target Python + +### Reactor for calling agent 1 +reactor LlmA { + state th + state running = False + state out_buffer = "" + state ready = False + + input user_in + physical action done + physical action notify_ready + output answer + output ready_out + + reaction(startup) -> notify_ready {= + import os, sys, importlib.util, threading, traceback + act = notify_ready + def _load(): + try: + here = os.path.dirname(__file__) + if here not in sys.path: sys.path.insert(0, here) + from llm_a import agent1 + act.schedule(1) + except Exception as e: + print("[LlmA] Preload failed:", e, flush=True) + traceback.print_exc() + threading.Thread(target=_load, daemon=True).start() + =} + + reaction(notify_ready) -> ready_out {= + self.ready = True + ready_out.set(True) + =} + + reaction(user_in) -> done {= + import threading + if not self.ready: return + if self.running: return + self.running = True + q = user_in.value + from llm_a import agent1 + def agentA(): + try: + self.out_buffer = agent1(q) + finally: + try: done.schedule(5) + except Exception as e: print("[LlmA] schedule failed:", e, flush=True) + self.th = threading.Thread(target=agentA, daemon=True) + self.th.start() + =} + + reaction(done) -> answer {= + self.running = False + answer.set(self.out_buffer) + =} +} + +### Reactor for calling agent 2 +reactor LlmB { + state th + state running = False + state out_buffer = "" + state ready = False + + input user_in + physical action done + physical action notify_ready + output answer + output ready_out + + reaction(startup) -> notify_ready {= + import os, sys, importlib.util, threading, traceback + act = notify_ready + def _load(): + try: + here = os.path.dirname(__file__) + if here not in sys.path: sys.path.insert(0, here) + from llm_b import agent2 + act.schedule(1) + except Exception as e: + print("[LlmB] Preload failed:", e, flush=True) + traceback.print_exc() + threading.Thread(target=_load, daemon=True).start() + =} + + reaction(notify_ready) -> ready_out {= + self.ready = True + ready_out.set(True) + =} + + reaction(user_in) -> done {= + import threading + if not self.ready: return + if self.running: return + self.running = True + q = user_in.value + from llm_b import agent2 + def agentB(): + try: + self.out_buffer = agent2(q) + finally: + try: done.schedule(5) + except Exception as e: print("[LlmB] schedule failed:", e, flush=True) + self.th = threading.Thread(target=agentB, daemon=True) + self.th.start() + =} + + reaction(done) -> answer {= + self.running = False + answer.set(self.out_buffer) + =} +} +###Judge reactor to determine which agent responds first + +reactor Judge { + state th + state reader_started = False + state terminate = False + 
state eof = False + state buffer = "" + state waiting = False + state logical_base_time = 0 + state physical_base_time = 0 + input ready_a + input ready_b + state a_ready = False + state b_ready = False + physical action line + physical action tick + logical action timeout(60 sec) + output ask + input llma + input llmb + output quit + + reaction(startup) {= + print("[Judge] Waiting for models to load", flush=True) + =} + + reaction(ready_a)->line {= + self.a_ready = True + if self.a_ready and self.b_ready and not self.reader_started: + import sys, threading + def reader(): + while not self.terminate: + s = input("Enter the quiz question (or 'quit')\n") + if s == "" or s.lower().strip() == "quit": + self.eof = True + try: line.schedule(0) + except Exception as e: print("[Judge] schedule EOF failed:", e, flush=True) + break + else: + self.buffer = s + try: line.schedule(1) + except Exception as e: + print("[Judge] schedule line failed:", e, flush=True) + break + self.reader_started = True + print("[Judge] Models ready. You can ask questions now.", flush=True) + self.th = threading.Thread(target=reader, daemon=True) + self.th.start() + =} + + reaction(ready_b)->line {= + self.b_ready = True + if self.a_ready and self.b_ready and not self.reader_started: + import sys, threading + def reader(): + while not self.terminate: + s = input("Enter the quiz question (or 'quit')\n") + if s == "" or s.lower().strip() == "quit": + self.eof = True + try: line.schedule(0) + except Exception as e: print("[Judge] schedule EOF failed:", e, flush=True) + break + else: + self.buffer = s + try: line.schedule(1) + except Exception as e: + print("[Judge] schedule line failed:", e, flush=True) + break + self.reader_started = True + print("[Judge] Models ready. You can ask questions now.", flush=True) + self.th = threading.Thread(target=reader, daemon=True) + self.th.start() + =} + + reaction(line) -> tick, ask, timeout, quit {= + if self.eof: + quit.set() + environment().sync_shutdown() + else: + self.waiting = True + self.logical_base_time = lf.time.logical_elapsed() + self.physical_base_time = lf.time.physical_elapsed() + timeout.schedule(0) + print(f"\n\n\nQuery: {self.buffer}\n", flush=True) + print("waiting...\n", flush=True) + tick.schedule(5) + =} + + reaction(tick) -> ask {= + ask.set(self.buffer) + =} + + reaction(llma) {= + if not self.waiting: return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f" Winner: LLM-A | logical {logical_ms} ms | physical {physical_ms} ms", flush=True) + print(f"{llma.value}", flush=True) + =} + + reaction(llmb) {= + if not self.waiting: return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f"Winner: LLM-B | logical {logical_ms} ms | physical {physical_ms} ms", flush=True) + print(f"{llmb.value}", flush=True) + =} + + reaction(timeout) {= + if not self.waiting: return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f"TIMEOUT (60 s) | logical {logical_ms} ms 
| physical {physical_ms} ms", flush=True) + =} + + reaction(shutdown) {= + self.terminate = True + if self.th and self.th.is_alive(): + self.th.join() + =} +} \ No newline at end of file diff --git a/llm/src/federated/llm_game_federated.lf b/llm/src/federated/llm_game_federated.lf new file mode 100644 index 0000000..7e6b961 --- /dev/null +++ b/llm/src/federated/llm_game_federated.lf @@ -0,0 +1,29 @@ +### llm.py file needs to be in the same directory +target Python { keepalive: true, files: ["../../../src/agents/llm_a.py", "../../../src/agents/llm_b.py" ] } #"llm_b.py" + +import LlmA, LlmB, Judge from "llm_base_class_federate.lf" + +preamble {= + import threading + import time + from llm_a import agent1 + from llm_b_m2 import agent2 +=} + + +federated reactor llm_game_federated at 10.218.100.95 { + + j = new Judge() + llma = new LlmA() + llmb = new LlmB() + + j.ask -> llma.user_in + j.ask -> llmb.user_in + llma.answer -> j.llma + llmb.answer -> j.llmb + + llma.ready_out -> j.ready_a + llmb.ready_out -> j.ready_b + +} + diff --git a/llm/src/llm_base_class.lf b/llm/src/llm_base_class.lf new file mode 100644 index 0000000..ae6f1c7 --- /dev/null +++ b/llm/src/llm_base_class.lf @@ -0,0 +1,161 @@ +target Python + + +### Reactor for calling agent 1 +reactor LlmA { + state th + state running = False + state out_buffer = "" + + input user_in + physical action done + output answer + + + reaction(user_in) -> done {= + if self.running: + return + self.running = True + query = user_in.value + def agentA(): + try: + self.out_buffer = agent1(query) + finally: + done.schedule(1) + self.th = threading.Thread(target=agentA, daemon=True) + self.th.start() + =} + + reaction(done) -> answer {= + self.running = False + answer.set(self.out_buffer) + =} +} + + +### Reactor for calling agent 2 +reactor LlmB { + state th + state running = False + state out_buffer = "" + input user_in + output answer + + physical action done + + reaction(user_in)->done{= + if self.running: + return + self.running = True + query = user_in.value + def agentB(): + try: + self.out_buffer = agent2(query) + finally: + done.schedule(1) + self.th = threading.Thread(target=agentB, daemon=True) + self.th.start() + =} + + reaction(done)->answer{= + self.running = False + answer.set(self.out_buffer) + =} + +} + +### Reactor for Judge +reactor Judge { + state th + state terminate = False + state eof = False + state buffer = "" + + output ask + output quit + input llma + input llmb + + state waiting = False + state logical_base_time = 0 + state physical_base_time = 0 + state winner = "" + + physical action line + logical action timeout(60 sec) + + reaction(startup) -> line {= + def reader(): + while not self.terminate: + s = input("Enter the quiz question\n") + if s == "": + self.eof = True + line.schedule(0) + break + elif s.lower().strip() == "quit": + self.eof = True + line.schedule(0) + break + else: + self.buffer = s + line.schedule(1) + self.th = threading.Thread(target=reader, daemon=True) + self.th.start() + =} + + reaction(line) -> ask, quit, timeout {= + if self.eof: + quit.set() + environment().sync_shutdown() + else: + self.waiting = True + self.winner = "" + self.logical_base_time = lf.time.logical_elapsed() + self.physical_base_time = lf.time.physical_elapsed() + timeout.schedule(0) + print(f"\n\n\nQuery: {self.buffer}\n") + print("waiting...\n") + ask.set(self.buffer) + =} + + reaction(llma) {= + if not self.waiting: + return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() 
+ logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f" Winner: LLM-A | logical {logical_ms} ms | physical {physical_ms} ms") + print(f"{llma.value}") + =} + + reaction(llmb) {= + if not self.waiting: + return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f"Winner: LLM-B | logical {logical_ms} ms | physical {physical_ms} ms") + print(f"{llmb.value}") + =} + + reaction(timeout) {= + if not self.waiting: + return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f"TIMEOUT (60 s) | logical {logical_ms} ms | physical {physical_ms} ms") + =} + + reaction(shutdown) {= + self.terminate = True + if self.th and self.th.is_alive(): + self.th.join() + =} +} \ No newline at end of file diff --git a/llm/src/llm_quiz_game.lf b/llm/src/llm_quiz_game.lf new file mode 100644 index 0000000..9f05ce6 --- /dev/null +++ b/llm/src/llm_quiz_game.lf @@ -0,0 +1,23 @@ +target Python { keepalive: true, files: ["agents/llm.py"] } + + +import LlmA from "llm_base_class.lf" +import LlmB from "llm_base_class.lf" +import Judge from "llm_base_class.lf" + +preamble {= + import threading + import time + from llm import agent1, agent2 +=} + +main reactor { + llma_response = new LlmA() + llmb_response = new LlmB() + j = new Judge() + + j.ask -> llma_response.user_in + j.ask -> llmb_response.user_in + llma_response.answer -> j.llma + llmb_response.answer -> j.llmb +} \ No newline at end of file