Demo LF programs with latency sensitive LLM inferences #3
Open
Deeksha-20-99 wants to merge 42 commits into main from llm (base: main)
Changes from 19 commits
Commits (42 total):
- e29fac3 Add README for LF LLM demo (Deeksha-20-99)
- 053b8d7 Adding work in progress code files for an llm example. Files: llm.py,… (Deeksha-20-99)
- 473c81f changed the file name of the file to be included in agent_llm.lf (Deeksha-20-99)
- 46522a1 Added a quiz game. It is a game between two LLM models answering user… (Deeksha-20-99)
- 9d9ee26 Updated the README.md for instructions to run the quiz game (Deeksha-20-99)
- fe1f605 Removing the older version of the file agent_llm.lf (Deeksha-20-99)
- b020664 Modified comments to the program (Deeksha-20-99)
- cc0a08a created the files for quiz game between two llm models using main re… (Deeksha-20-99)
- 632dc8e Adding the git ignore file (Deeksha-20-99)
- 6c8117d Fixed the issue for the judge federate to receive the signal that mod… (Deeksha-20-99)
- 2f1a884 Added the version of files for running on different devices (Deeksha-20-99)
- 1958fbb Adding a python script for llama 3.2 1B for jetson orin (Deeksha-20-99)
- 60f642d commented the code for testing (Deeksha-20-99)
- 6a26cab Testing Jetson (Deeksha-20-99)
- aef0ac9 Changed the file names in base class (Deeksha-20-99)
- c4c6353 Changed the RTI to jetson (Deeksha-20-99)
- 9d503d5 corrected the ip for jetson orin (Deeksha-20-99)
- 9a1730b Add requirements.txt (hokeun)
- ea20703 Move requirements.txt to top dir (hokeun)
- e16438a Adding the organized folders and README.md (Deeksha-20-99)
- cd83f0a Updated the correct links for federated_execution and requirements in… (Deeksha-20-99)
- 6b8c458 Updated the requirements.txt for README.md (Deeksha-20-99)
- abd32ed changed the llm_b import statement (Deeksha-20-99)
- 27d3561 Rename directories and remove unnecessary files (hokeun)
- 04f195a Added more instruction on how to execute this demo README.md (Deeksha-20-99)
- 15075fb changed the path file names for the python files (Deeksha-20-99)
- 105cecf Added the images folder for README.md (Deeksha-20-99)
- 35eefa9 Updated the image position on the README.md (Deeksha-20-99)
- 5f3b61c Revise README for LLM Demo overview and structure (hokeun)
- 66da8ce corrected the spelling of environment README.md (Deeksha-20-99)
- 67cf0bf corrected the spelling README.md (Deeksha-20-99)
- 18a8548 Changed the comments and removed the Hugging face token and it will b… (Deeksha-20-99)
- ec73fce Updated the README.md for federated execution (Deeksha-20-99)
- 03a1007 Corrected the path of the python files (Deeksha-20-99)
- 050fe9f Corrected the paths of the images in the README.md (Deeksha-20-99)
- 8634b49 added the contributors name README.md (Deeksha-20-99)
- 08f6ed6 Merge branch 'llm' of github.com:lf-lang/lf-demos into llm (Deeksha-20-99)
- 2e73975 Removed torch and torchvision since they are dependent on the device (Deeksha-20-99)
- 3ccb0f2 corrected few things on the README regarding the different reactors (Deeksha-20-99)
- ae28863 Updated the required python version in the README.md (Deeksha-20-99)
- b09a9c3 Added a command to check if requirements are installed README.md (Deeksha-20-99)
- 042317f added the common environment name README.md (Deeksha-20-99)
**`.gitignore`** (new file)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| llm/fed-gen/ | ||
| llm/src-gen/ | ||
| llm/include/ | ||
| llm/bin | ||
| **__pycache__** | ||
| llm/=** |
**`README.md`** (new file)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,95 @@ | ||
| # LLM Demo | ||
|
|
||
| # Overview | ||
| This is a quiz-style game between two LLM agents. For each user question typed at the keyboard, both agents answer in parallel. The Judge announces whichever answer arrives first (or a timeout if neither responds within 60 sec), and prints per-question elapsed logical and physical times. | ||
|
|
||
| # Prerequisites | ||
|
|
||
| You need Python installed, as llm.py is written in Python. | ||
|
|
||
| ## Library Dependencies | ||
| To run this project, the following dependencies are required. The model used in this repository is quantized to 4-bit precision (bnb_4bit) and relies on bitsandbytes for efficient matrix operations and memory optimization, so compatible versions of bitsandbytes, torch, and torchvision are required. | ||
| While newer versions of other dependencies may work, the specific versions listed below have been tested and are recommended for optimal performance. | ||
|
|
||
| It is highly recommended to create a Python virtual environment or a Conda environment to manage dependencies. Install the required packages as shown below. | ||
|
|
||
| ``` | ||
| pip install accelerate | ||
| pip install transformers | ||
| pip install tokenizers | ||
| pip install "bitsandbytes>=0.43.0" | ||
| pip install torch | ||
| pip install torchvision | ||
| ``` | ||
|
|
||
| ## System Requirements | ||
|
|
||
| For optimal performance, the following hardware and software setup was used. \ | ||
| **Note:** To replicate this demo, you can use any equivalent hardware that meets the computational requirements. | ||
|
|
||
| ### Hardware Requirements | ||
| - **GPU**: NVIDIA RTX A6000 | ||
|
|
||
| ### Software Requirements | ||
| - **Python** (Ensure Python is installed) | ||
| - **CUDA Version**: 12.8 | ||
| - **NVIDIA-SMI**: For monitoring GPU performance and memory utilization | ||
|
|
||
| ### Model Dependencies | ||
| - **Pre-trained Models**: [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) and [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | ||
| **Note:** To access and use the pre-trained models, an authentication token must be obtained from the [Hugging Face token settings](https://huggingface.co/settings/tokens). Ensure you have a valid API token and that authentication is configured. | ||
|
|
||
| Make sure the environment is properly configured to use CUDA for optimal GPU acceleration. | ||
|
|
||
| # Files and directories in this repository | ||
| - **`llm.py`** - Contains the logic to load and call LLM models from the Hugging Face pretrained hub. | ||
| - **`llm_quiz_game.lf`** - Lingua Franca program that defines the quiz game reactors (Keyboard input, LLM agents, and Judge). | ||
|
|
||
| # Execution Workflow | ||
|
|
||
| ### Step 1: Compile the LF program | ||
| Compile **`llm_quiz_game.lf`** with the `lfc` compiler. | ||
|
|
||
| **Note:** | ||
| - Ensure that you specify the correct file paths | ||
|
|
||
| Run the following command: | ||
|
|
||
| ``` | ||
| lfc src/llm_quiz_game.lf | ||
| ``` | ||
|
|
||
| ### Step 2: Run the binary file and input the quiz question | ||
| Run the following command: | ||
|
|
||
| ``` | ||
| ./bin/llm_quiz_game | ||
| ``` | ||
|
|
||
| The program will then prompt you to enter a quiz question from the keyboard. | ||
|
|
||
| Example output printed on the terminal: | ||
|
|
||
| <pre> | ||
|
|
||
| -------------------------------------------------- | ||
| ---- System clock resolution: 1 nsec | ||
| ---- Start execution on Fri Sep 19 10:46:31 2025 ---- plus 772215861 nanoseconds | ||
| Enter the quiz question | ||
| What is the capital of South Korea? | ||
| Query: What is the capital of South Korea? | ||
|
|
||
| waiting... | ||
|
|
||
| Winner: LLM-B | logical 1184 ms | physical 1184 ms | ||
| Answer: Seoul. | ||
| -------------------------------------------------- | ||
|
|
||
| </pre> | ||
|
|
||
| ### Step 3: Monitoring GPU Performance (Optional) | ||
| To monitor GPU performance and memory utilization while the demo is running, use NVIDIA-SMI in another terminal: | ||
| ``` | ||
| nvidia-smi | ||
| ``` | ||
| # Contributors | ||
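The README above describes a first-answer-wins Judge with a 60-second timeout, but the Lingua Franca program that implements it (`llm_quiz_game.lf`) is not part of this 19-commit view. As a rough illustration only, a plain-Python sketch of that behaviour could look like the following; the function and variable names are made up for the example, and the real demo additionally reports logical time, which ordinary Python cannot.

```python
# Illustrative sketch only: the actual judging is done by the Judge reactor in
# llm_quiz_game.lf. Run both agents in parallel, report whichever answer
# arrives first, and give up after a 60-second timeout.
import time
from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED

def judge(question, agent_a, agent_b, timeout_s=60.0):
    start = time.monotonic()
    with ThreadPoolExecutor(max_workers=2) as pool:
        futures = {pool.submit(agent_a, question): "LLM-A",
                   pool.submit(agent_b, question): "LLM-B"}
        done, _ = wait(futures, timeout=timeout_s, return_when=FIRST_COMPLETED)
        elapsed_ms = (time.monotonic() - start) * 1000
        if not done:
            return "Timeout", None, elapsed_ms
        winner = next(iter(done))
        # Note: the executor's shutdown still waits for the slower agent
        # before this function actually returns.
        return futures[winner], winner.result(), elapsed_ms

# Example with trivial stand-in agents:
if __name__ == "__main__":
    name, answer, ms = judge("What is the capital of South Korea?",
                             lambda q: "Seoul.", lambda q: "Seoul.")
    print(f"Winner: {name} | physical {ms:.0f} ms | Answer: {answer}")
```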
**`requirements.txt`** (new file)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| accelerate | ||
| transformers | ||
| tokenizers | ||
| bitsandbytes>=0.43.0 | ||
| torch | ||
| torchvision | ||
|
|
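A later commit in this PR mentions adding a command to check that the requirements are installed; that command is not shown in this 19-commit view. As an illustrative stand-in (not part of the PR), a small Python check that the packages listed above import cleanly could look like this:

```python
# Illustrative check (not part of this PR) that the packages listed in
# requirements.txt can be imported, printing each package's version.
import importlib

for pkg in ["accelerate", "transformers", "tokenizers", "bitsandbytes", "torch", "torchvision"]:
    try:
        mod = importlib.import_module(pkg)
        print(f"{pkg}: {getattr(mod, '__version__', 'unknown version')}")
    except ImportError as exc:
        print(f"{pkg}: MISSING ({exc})")
```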
**`llm.py`** (new file)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,92 @@ | ||
| ### Import Libraries | ||
| import transformers | ||
| import torch | ||
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig | ||
| from torch import cuda, bfloat16 | ||
|
|
||
| ### Add your Hugging Face token here | ||
| hf_auth = "Add your token here" | ||
|
|
||
| ### Models used as the two agents | ||
| model_id = "meta-llama/Llama-2-7b-chat-hf" | ||
| model_id_2 = "meta-llama/Llama-2-70b-chat-hf" | ||
|
|
||
| ### Check for a GPU and pick the compute dtype (bfloat16 on GPU, float32 otherwise) | ||
| has_cuda = torch.cuda.is_available() | ||
| dtype = torch.bfloat16 if has_cuda else torch.float32 | ||
|
|
||
| ### 4-bit quantization configuration | ||
| bnb_config = None | ||
| ### If CUDA is available, load the models with 4-bit quantization | ||
| if has_cuda: | ||
| try: | ||
| import bitsandbytes as bnb | ||
| bnb_config = BitsAndBytesConfig( | ||
| load_in_4bit=True, | ||
| bnb_4bit_quant_type="nf4", | ||
| bnb_4bit_use_double_quant=True, | ||
| bnb_4bit_compute_dtype=dtype, | ||
| ) | ||
| except Exception: | ||
| bnb_config = None | ||
|
|
||
| ### Load the pre-trained tokenizers | ||
| tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth, use_fast=True) | ||
| tokenizer_2 = AutoTokenizer.from_pretrained(model_id_2, token=hf_auth, use_fast=True) | ||
| for tok in (tokenizer, tokenizer_2): | ||
| if tok.pad_token_id is None: | ||
| tok.pad_token = tok.eos_token | ||
|
|
||
| ### Shared keyword arguments: both models use the same device map and 4-bit quantization settings | ||
| common = dict( | ||
| device_map="auto" if has_cuda else None, | ||
| dtype=dtype, | ||
| low_cpu_mem_usage=True, | ||
| ) | ||
| if bnb_config is not None: | ||
| common["quantization_config"] = bnb_config | ||
|
|
||
| ### Load the pre-trained models | ||
| model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_auth, **common) | ||
| model_2 = AutoModelForCausalLM.from_pretrained(model_id_2, token=hf_auth, **common) | ||
| model.eval(); model_2.eval() | ||
|
|
||
|
|
||
|
|
||
| ### Generation arguments for both models | ||
| GEN_A = dict(max_new_tokens=24, do_sample=False, temperature=0.1, | ||
| eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id) | ||
| GEN_B = dict(max_new_tokens=24, do_sample=False, temperature=0.1, | ||
| eos_token_id=tokenizer_2.eos_token_id, pad_token_id=tokenizer_2.pad_token_id) | ||
|
|
||
| ### Trim generated text to a one-line answer | ||
| def postprocess(text: str) -> str: | ||
| t = text.strip() | ||
| for sep in ["\n", ". ", " "]: | ||
| idx = t.find(sep) | ||
| if idx > 0: | ||
| t = t[:idx] | ||
| break | ||
| return t.strip().strip(":").strip() | ||
|
|
||
| ### agent1 is called from the .lf code | ||
| def agent1(q: str) -> str: | ||
| prompt = f"You are a concise Q&A assistant.\n\n{q}\n" | ||
| inputs = tokenizer(prompt, return_tensors="pt") | ||
| if has_cuda: inputs = {k: v.to("cuda") for k, v in inputs.items()} | ||
| with torch.no_grad(): | ||
| out = model.generate(**inputs, **GEN_A) | ||
| prompt_len = inputs["input_ids"].shape[1] | ||
| result = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True) | ||
| return postprocess(result) | ||
|
|
||
| ### agent2 is called from the .lf code | ||
| def agent2(q: str) -> str: | ||
| prompt = f"You are a concise Q&A assistant.\n\n{q}\n" | ||
| inputs = tokenizer_2(prompt, return_tensors="pt") | ||
| if has_cuda: inputs = {k: v.to("cuda") for k, v in inputs.items()} | ||
| with torch.no_grad(): | ||
| out = model_2.generate(**inputs, **GEN_B) | ||
| prompt_len = inputs["input_ids"].shape[1] | ||
| result = tokenizer_2.decode(out[0][prompt_len:], skip_special_tokens=True) | ||
| return postprocess(result) |
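For a quick stand-alone check of the two agent functions defined above, outside the LF program, something like the following should work, assuming the file is importable as `llm` and a valid Hugging Face token has been filled in; loading both Llama-2 checkpoints requires a GPU with substantial memory.

```python
# Illustrative smoke test of llm.py's agent functions, run directly in Python
# rather than through llm_quiz_game.lf. Assumes llm.py is on the Python path.
from llm import agent1, agent2

question = "What is the capital of South Korea?"
print("LLM-A:", agent1(question))   # Llama-2-7b-chat answer
print("LLM-B:", agent2(question))   # Llama-2-70b-chat answer
```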
**`llm_a.py`** (new file)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,77 @@ | ||
| # llm_a.py | ||
|
|
||
| import torch | ||
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig | ||
|
|
||
| # <<< put your token here >>> | ||
| hf_auth = "add token here " | ||
|
|
||
| # Model | ||
| model_id = "meta-llama/Llama-2-7b-chat-hf" | ||
|
|
||
| # Require GPU | ||
| has_cuda = torch.cuda.is_available() | ||
| if not has_cuda: | ||
| raise RuntimeError("CUDA GPU required for this configuration.") | ||
| dtype = torch.bfloat16 if has_cuda else torch.float32 | ||
|
|
||
| # 4-bit quantization | ||
| bnb_config = None | ||
| if has_cuda: | ||
| try: | ||
| import bitsandbytes as bnb | ||
| bnb_config = BitsAndBytesConfig( | ||
| load_in_4bit=True, | ||
| bnb_4bit_quant_type="nf4", | ||
| bnb_4bit_use_double_quant=True, | ||
| bnb_4bit_compute_dtype=dtype, | ||
| ) | ||
| except Exception: | ||
| bnb_config = None | ||
|
|
||
| # Tokenizer | ||
| tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth, use_fast=True) | ||
| if tokenizer.pad_token_id is None: | ||
| tokenizer.pad_token = tokenizer.eos_token | ||
|
|
||
| # Shared kwargs | ||
| common = dict( | ||
| device_map="auto" if has_cuda else None, | ||
| dtype=dtype, | ||
| low_cpu_mem_usage=True, | ||
| ) | ||
| if bnb_config is not None: | ||
| common["quantization_config"] = bnb_config | ||
|
|
||
| # Model | ||
| model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_auth, **common) | ||
| model.eval() | ||
|
|
||
| # Generation args | ||
| GEN_A = dict( | ||
| max_new_tokens=24, do_sample=False, temperature=0.1, | ||
| eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id | ||
| ) | ||
|
|
||
| # One-line postprocess | ||
| def postprocess(text: str) -> str: | ||
| t = text.strip() | ||
| for sep in ["\n", ". ", " "]: | ||
| idx = t.find(sep) | ||
| if idx > 0: | ||
| t = t[:idx] | ||
| break | ||
| return t.strip().strip(":").strip() | ||
|
|
||
| # Agent 1 | ||
| def agent1(q: str) -> str: | ||
| prompt = f"You are a concise Q&A assistant.\n\n{q}\n" | ||
| inputs = tokenizer(prompt, return_tensors="pt") | ||
| if has_cuda: | ||
| inputs = {k: v.to("cuda") for k, v in inputs.items()} | ||
| with torch.no_grad(): | ||
| out = model.generate(**inputs, **GEN_A) | ||
| prompt_len = inputs["input_ids"].shape[1] | ||
| result = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True) | ||
| print(result) | ||
| return postprocess(result) |
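A later commit in this PR ("Changed the comments and removed the Hugging face token…") drops the hardcoded token. One common way to do that, sketched below with an assumed variable name `HF_TOKEN` (not something this PR defines), is to read the token from the environment:

```python
# Sketch: read the Hugging Face token from an environment variable instead of
# hardcoding it in the source file. The variable name HF_TOKEN is an assumption.
import os

hf_auth = os.environ.get("HF_TOKEN")
if not hf_auth:
    raise RuntimeError("Set HF_TOKEN to your Hugging Face access token before running.")
```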
**`llm_b.py`** (new file)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,78 @@ | ||
|
|
||
| # llm_b.py | ||
|
|
||
| import torch | ||
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig | ||
|
|
||
| # <<< put your token here >>> | ||
| hf_auth = "add token here" | ||
|
|
||
| # Model | ||
| model_id_2 = "meta-llama/Llama-2-70b-chat-hf" | ||
|
|
||
| # Require GPU | ||
| has_cuda = torch.cuda.is_available() | ||
| if not has_cuda: | ||
| raise RuntimeError("CUDA GPU required for this configuration.") | ||
| dtype = torch.bfloat16 if has_cuda else torch.float32 | ||
|
|
||
| # 4-bit quantization | ||
| bnb_config = None | ||
| if has_cuda: | ||
| try: | ||
| import bitsandbytes as bnb | ||
| bnb_config = BitsAndBytesConfig( | ||
| load_in_4bit=True, | ||
| bnb_4bit_quant_type="nf4", | ||
| bnb_4bit_use_double_quant=True, | ||
| bnb_4bit_compute_dtype=dtype, | ||
| ) | ||
| except Exception: | ||
| bnb_config = None | ||
|
|
||
| # Tokenizer | ||
| tokenizer_2 = AutoTokenizer.from_pretrained(model_id_2, token=hf_auth, use_fast=True) | ||
| if tokenizer_2.pad_token_id is None: | ||
| tokenizer_2.pad_token = tokenizer_2.eos_token | ||
|
|
||
| # Shared kwargs | ||
| common = dict( | ||
| device_map="auto" if has_cuda else None, | ||
| dtype=dtype, | ||
| low_cpu_mem_usage=True, | ||
| ) | ||
| if bnb_config is not None: | ||
| common["quantization_config"] = bnb_config | ||
|
|
||
| # Model | ||
| model_2 = AutoModelForCausalLM.from_pretrained(model_id_2, token=hf_auth, **common) | ||
| model_2.eval() | ||
|
|
||
| # Generation args | ||
| GEN_B = dict( | ||
| max_new_tokens=24, do_sample=False, temperature=0.1, | ||
| eos_token_id=tokenizer_2.eos_token_id, pad_token_id=tokenizer_2.pad_token_id | ||
| ) | ||
|
|
||
| # One-line postprocess | ||
| def postprocess(text: str) -> str: | ||
| t = text.strip() | ||
| for sep in ["\n", ". ", " "]: | ||
| idx = t.find(sep) | ||
| if idx > 0: | ||
| t = t[:idx] | ||
| break | ||
| return t.strip().strip(":").strip() | ||
|
|
||
| # Agent 2 | ||
| def agent2(q: str) -> str: | ||
| prompt = f"You are a concise Q&A assistant.\n\n{q}\n" | ||
| inputs = tokenizer_2(prompt, return_tensors="pt") | ||
| if has_cuda: | ||
| inputs = {k: v.to("cuda") for k, v in inputs.items()} | ||
| with torch.no_grad(): | ||
| out = model_2.generate(**inputs, **GEN_B) | ||
| prompt_len = inputs["input_ids"].shape[1] | ||
| result = tokenizer_2.decode(out[0][prompt_len:], skip_special_tokens=True) | ||
| print(result) | ||
| return postprocess(result) |
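Since this PR is about latency-sensitive inference, a rough per-call timing of one agent can also be taken outside the LF program, as sketched below; this only measures physical wall-clock time, whereas the LF Judge additionally reports logical time. The import assumes `llm_b.py` is on the Python path.

```python
# Illustrative wall-clock timing of a single agent2 call from llm_b.py.
import time
from llm_b import agent2

t0 = time.monotonic()
answer = agent2("What is the capital of South Korea?")
elapsed_ms = (time.monotonic() - t0) * 1000
print(f"Answer: {answer} | physical {elapsed_ms:.0f} ms")
```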
Review comment: We need Python version information here (e.g., a minimum version requirement).