Commits (38)
183748c
Initial plan
Copilot Jan 19, 2026
9654117
Add Qwen3 language model support to pythainlp.lm
Copilot Jan 19, 2026
0e4b368
Address code review feedback: Fix type annotations and docstrings
Copilot Jan 19, 2026
e53c777
Fix device validation and improve docstring clarity
Copilot Jan 19, 2026
fe937d2
Fix device handling: use .to() instead of device_map parameter
Copilot Jan 19, 2026
2018813
Move Qwen3 tests from tests/core to tests/extra
Copilot Jan 19, 2026
69fbb42
Update copyright year and sort qwen3 dependency alphabetically
Copilot Jan 19, 2026
c816818
Update copyright year in testx_lm.py
bact Jan 19, 2026
0d104dd
Merge branch 'dev' of https://github.com/PyThaiNLP/pythainlp into cop…
Copilot Jan 19, 2026
772a80b
Update pythainlp/lm/qwen3.py
bact Jan 23, 2026
91b7bda
Merge branch 'dev' of https://github.com/PyThaiNLP/pythainlp into cop…
Copilot Feb 1, 2026
e2a14de
Update CITATION.cff from codemeta.json
Feb 1, 2026
b29e085
Merge branch 'dev' of https://github.com/PyThaiNLP/pythainlp into cop…
Copilot Feb 1, 2026
f55c1c6
Merge branch 'dev' of https://github.com/PyThaiNLP/pythainlp into cop…
Copilot Feb 2, 2026
c73b2a3
Apply code review feedback: Add input validation, error handling, and…
Copilot Feb 2, 2026
79a4204
Apply second round of code review feedback: Fix copyright headers, im…
Copilot Feb 2, 2026
e775b9e
Add type annotations for torch_dtype in WangChanGLM and ChatBotModel …
Copilot Feb 2, 2026
0b031fc
Merge branch 'dev' of https://github.com/PyThaiNLP/pythainlp into cop…
Copilot Feb 3, 2026
94d4314
Merge branch 'dev' of https://github.com/PyThaiNLP/pythainlp into cop…
Copilot Feb 3, 2026
1339261
Merge branch 'dev' into copilot/add-qwen3-0-6b-model
Copilot Feb 3, 2026
5b3416a
Merge branch 'dev' into copilot/add-qwen3-0-6b-model
bact Feb 4, 2026
5ecd247
Complete type annotations for Qwen3 class
Copilot Feb 4, 2026
88762b0
Replace Any type annotations with specific types from transformers li…
Copilot Feb 4, 2026
1e3f0b6
Fix type annotation reassignment in ChatBotModel
Copilot Feb 4, 2026
28869e0
Import torch in TYPE_CHECKING block
bact Feb 4, 2026
06b5d5f
Add import for torch in ChatBotModel
bact Feb 4, 2026
2d06f22
Merge branch 'dev' into copilot/add-qwen3-0-6b-model
bact Feb 4, 2026
aa8751b
Remove duplicate import statement
bact Feb 4, 2026
3d550ce
Fix ruff import sorting error in chat/core.py
Copilot Feb 4, 2026
0b52d0d
Merge branch 'dev' into copilot/add-qwen3-0-6b-model
Copilot Feb 5, 2026
b9bf7fc
Apply code review feedback: Improve code quality and documentation
Copilot Feb 5, 2026
ff07202
Apply dependency and import improvements from code review
Copilot Feb 5, 2026
df97e0c
Fix torch import scope and type annotation consistency
Copilot Feb 6, 2026
ffb73ee
Remove AutoTokenizer import from core.py
bact Feb 6, 2026
2a9f6e9
Merge branch 'dev' of https://github.com/PyThaiNLP/pythainlp into cop…
Copilot Feb 6, 2026
91303d3
Move Qwen3 tests from extra to noauto_torch suite and sync with dev b…
Copilot Feb 6, 2026
432c2ab
Improve dependency error handling and type annotations in lm module
Copilot Feb 6, 2026
f529a9b
Reorder import statements in __init__.py
bact Feb 6, 2026
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -143,6 +143,8 @@ wtp = ["transformers>=4.22.1", "wtpsplit>=1.0.1"]

wunsen = ["wunsen>=0.0.3"]

qwen3 = ["torch>=1.0.0", "transformers>=4.22.1"]

# Compact dependencies - safe small set of optional dependencies
compact = [
"nlpo3>=1.3.1",
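For context, a minimal sketch of how a caller could check that the new optional extra is installed before using the class. The probe itself is illustrative; the install command is the one given by the ImportError message in pythainlp/lm/__init__.py below.

# Hedged sketch: probe for the qwen3 extra before importing the heavy model class.
# Install with: pip install pythainlp[qwen3]
try:
    import torch  # noqa: F401
    import transformers  # noqa: F401
except ImportError as exc:
    raise SystemExit(
        "The qwen3 extra is not installed; run: pip install pythainlp[qwen3]"
    ) from exc

from pythainlp.lm import Qwen3

model = Qwen3()  # instantiation alone does not load any model weights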
14 changes: 13 additions & 1 deletion pythainlp/lm/__init__.py
@@ -2,9 +2,21 @@
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0

__all__ = ["calculate_ngram_counts", "remove_repeated_ngrams"]
__all__ = ["calculate_ngram_counts", "remove_repeated_ngrams", "Qwen3"]

from pythainlp.lm.text_util import (
calculate_ngram_counts,
remove_repeated_ngrams,
)

try:
from pythainlp.lm.qwen3 import Qwen3
except ImportError:
# If dependencies are not installed, make Qwen3 available but raise
# error when instantiated
class Qwen3: # type: ignore
def __init__(self):
raise ImportError(
"Qwen3 requires additional dependencies. "
"Install with: pip install pythainlp[qwen3]"
)

Copilot AI Feb 5, 2026


The fallback Qwen3 class defined when ImportError occurs (lines 17-22) lacks type annotations. According to the custom coding guidelines, the codebase should maintain near-100% type annotation coverage. Add a -> None return type annotation to the __init__ method for consistency with the codebase's type annotation standards.

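For illustration, a sketch of the annotated fallback along the lines this comment suggests (a sketch only, not the merged code; the error message is taken from the diff above):

# Hedged sketch: the fallback stub with the suggested -> None annotation.
class Qwen3:  # type: ignore
    def __init__(self) -> None:
        raise ImportError(
            "Qwen3 requires additional dependencies. "
            "Install with: pip install pythainlp[qwen3]"
        )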
197 changes: 197 additions & 0 deletions pythainlp/lm/qwen3.py
@@ -0,0 +1,197 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

from typing import Any

import torch


class Qwen3:
"""Qwen3-0.6B language model for Thai text generation.

A small but capable language model from Alibaba Cloud's Qwen family,
optimized for various NLP tasks including Thai language processing.
"""

def __init__(self):
self.model = None
self.tokenizer = None
self.device = None
self.torch_dtype = None
self.model_path = None

def load_model(
self,
model_path: str = "Qwen/Qwen3-0.6B",
device: str = "cuda",
torch_dtype=torch.float16,

Copilot AI Feb 2, 2026


The torch_dtype parameter lacks a type annotation. While it's documented as accepting torch data types (e.g., torch.float16, torch.bfloat16), it should have a proper type hint. Consider using typing.Optional[torch.dtype] or similar to maintain consistency with the codebase's use of type hints.

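For illustration, a sketch of the signature with the annotation this comment proposes (a sketch under the comment's Optional[torch.dtype] assumption, not the merged implementation; the method body is elided):

from typing import Optional

import torch


class Qwen3:
    # Hedged sketch: signature only, showing the proposed torch_dtype annotation.
    def load_model(
        self,
        model_path: str = "Qwen/Qwen3-0.6B",
        device: str = "cuda",
        torch_dtype: Optional[torch.dtype] = torch.float16,
        low_cpu_mem_usage: bool = True,
    ) -> None:
        ...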
low_cpu_mem_usage: bool = True,
):
"""Load Qwen3 model.

:param str model_path: model path or HuggingFace model ID
:param str device: device (cpu, cuda or other)
:param torch_dtype: torch data type (e.g., torch.float16, torch.bfloat16)

Copilot AI Feb 2, 2026


The documentation for the torch_dtype parameter is missing the type specification. The codebase convention is to use the format :param type name: description. This should be :param Optional[torch.dtype] torch_dtype: torch data type (e.g., torch.float16, torch.bfloat16) to match the rest of the codebase style and provide complete documentation.

:param bool low_cpu_mem_usage: low cpu mem usage

:Example:
::

from pythainlp.lm import Qwen3
import torch

model = Qwen3()
model.load_model(device="cpu", torch_dtype=torch.bfloat16)
"""
from transformers import AutoModelForCausalLM, AutoTokenizer

self.device = device
self.torch_dtype = torch_dtype
self.model_path = model_path

self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path,
torch_dtype=torch_dtype,
low_cpu_mem_usage=low_cpu_mem_usage,
)
self.model.to(device)


Copilot AI Feb 2, 2026


The load_model method does not handle potential exceptions that could occur during model loading (e.g., network errors, invalid model paths, insufficient memory, or CUDA availability issues). Consider adding try-except blocks with informative error messages to help users diagnose common problems, especially for device-related errors.

Suggested change
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path,
torch_dtype=torch_dtype,
low_cpu_mem_usage=low_cpu_mem_usage,
)
self.model.to(device)
try:
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
except OSError as exc:
raise RuntimeError(
f"Failed to load tokenizer from '{self.model_path}'. "
"Check the model path or your network connection."
) from exc
try:
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path,
torch_dtype=torch_dtype,
low_cpu_mem_usage=low_cpu_mem_usage,
)
except OSError as exc:
raise RuntimeError(
f"Failed to load model from '{self.model_path}'. "
"This can happen due to an invalid model path, missing files, "
"or insufficient disk space."
) from exc
except RuntimeError as exc:
raise RuntimeError(
"Failed to load model weights. "
"This can be caused by insufficient memory or an incompatible "
"torch_dtype setting."
) from exc
if isinstance(device, str) and device.startswith("cuda"):
if not torch.cuda.is_available():
raise RuntimeError(
"CUDA device requested but CUDA is not available. "
"Check your PyTorch installation and GPU drivers, or use "
"device='cpu' instead."
)
try:
self.model.to(device)
except RuntimeError as exc:
raise RuntimeError(
f"Failed to move model to device '{device}'. "
"Ensure the device exists and has enough memory, and that your "
"PyTorch installation supports this device."
) from exc

def generate(
self,
text: str,

Copilot AI Feb 2, 2026


The generate method lacks input validation for the text parameter. If an empty string or None is passed, it may cause unclear errors downstream. Consider adding validation to check that text is a non-empty string before processing.

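For illustration, a minimal validation sketch along the lines this comment suggests; the helper name and error message are illustrative, not part of the PR:

def _validate_prompt(text: str) -> str:
    # Hedged sketch: reject None, empty, or whitespace-only prompts early.
    if not isinstance(text, str) or not text.strip():
        raise ValueError("text must be a non-empty string")
    return text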
max_new_tokens: int = 512,
temperature: float = 0.7,
top_p: float = 0.9,
top_k: int = 50,
do_sample: bool = True,
skip_special_tokens: bool = True,
) -> str:
"""Generate text from a prompt.

:param str text: input text prompt
:param int max_new_tokens: maximum number of new tokens to generate
:param float temperature: temperature for sampling (higher = more random)
:param float top_p: top p for nucleus sampling
:param int top_k: top k for top-k sampling
:param bool do_sample: whether to use sampling or greedy decoding
:param bool skip_special_tokens: skip special tokens in output
:return: generated text
:rtype: str

:Example:
::

from pythainlp.lm import Qwen3
import torch

model = Qwen3()
model.load_model(device="cpu", torch_dtype=torch.bfloat16)

result = model.generate("สวัสดี")
print(result)
"""
if self.model is None or self.tokenizer is None or self.device is None:
raise RuntimeError(
"Model not loaded. Please call load_model() first."
)

inputs = self.tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"].to(self.device)

with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
max_new_tokens=max_new_tokens,
temperature=temperature,
top_p=top_p,
top_k=top_k,
do_sample=do_sample,
)
Comment on lines +167 to +174

Copilot AI Feb 5, 2026


When do_sample=False (greedy decoding), the temperature, top_p, and top_k parameters are ignored by the transformers library. Consider adding validation to warn users or handle this case explicitly. The current implementation may mislead users who set do_sample=False but also provide temperature values expecting them to have an effect.

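For illustration, a sketch of the explicit handling this comment asks for; the helper name and warning text are illustrative:

import warnings


def _warn_if_sampling_args_ignored(do_sample: bool) -> None:
    # Hedged sketch: make it explicit that greedy decoding ignores the sampling knobs.
    if not do_sample:
        warnings.warn(
            "do_sample=False selects greedy decoding; temperature, top_p and "
            "top_k will have no effect.",
            UserWarning,
        )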

# Decode only the newly generated tokens
generated_text = self.tokenizer.decode(
output_ids[0][len(input_ids[0]) :],
Comment on lines 176 to 180

Copilot AI Feb 5, 2026


The slicing operation output_ids[0][len(input_ids[0]):] assumes that output_ids[0] and input_ids[0] are present. While this should generally be safe given the model.generate call, it would be more defensive to check the shapes or handle potential IndexError. Consider adding a check or comment explaining why this is safe in this context.

Suggested change
# Decode only the newly generated tokens
generated_text = self.tokenizer.decode(
output_ids[0][len(input_ids[0]) :],
# Decode only the newly generated tokens.
if (
output_ids.dim() == 2
and input_ids.dim() == 2
and output_ids.size(0) > 0
and input_ids.size(0) > 0
):
start_idx = input_ids.size(1)
generated_ids = output_ids[0, start_idx:]
else:
raise RuntimeError(
"Unexpected tensor shape from model.generate(); "
"expected 2D tensors with non-empty batch dimension."
)
generated_text = self.tokenizer.decode(
generated_ids,

skip_special_tokens=skip_special_tokens,
)

return generated_text

def chat(
self,
messages: list[dict[str, Any]],

Copilot AI Feb 2, 2026


The chat method lacks input validation for the messages parameter. If an empty list is passed, the method will proceed without error but may produce unexpected behavior. Consider adding a check to validate that messages is not empty and contains properly formatted message dictionaries with 'role' and 'content' keys.

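For illustration, a minimal validation sketch along the lines this comment suggests; the helper name and error messages are illustrative:

from __future__ import annotations

from typing import Any


def _validate_messages(messages: list[dict[str, Any]]) -> None:
    # Hedged sketch: require a non-empty list of dicts with 'role' and 'content' keys.
    if not isinstance(messages, list) or not messages:
        raise ValueError("messages must be a non-empty list")
    for msg in messages:
        if not isinstance(msg, dict) or "role" not in msg or "content" not in msg:
            raise ValueError(
                "each message must be a dict with 'role' and 'content' keys"
            )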
max_new_tokens: int = 512,
temperature: float = 0.7,
top_p: float = 0.9,
top_k: int = 50,
do_sample: bool = True,
skip_special_tokens: bool = True,
) -> str:
"""Generate text using chat format.

:param list[dict[str, Any]] messages: list of message dictionaries with 'role' and 'content' keys
:param int max_new_tokens: maximum number of new tokens to generate
:param float temperature: temperature for sampling
:param float top_p: top p for nucleus sampling
:param int top_k: top k for top-k sampling
:param bool do_sample: whether to use sampling
:param bool skip_special_tokens: skip special tokens in output
:return: generated response
:rtype: str

:Example:
::

from pythainlp.lm import Qwen3
import torch

model = Qwen3()
model.load_model(device="cpu", torch_dtype=torch.bfloat16)

messages = [{"role": "user", "content": "สวัสดีครับ"}]
response = model.chat(messages)
print(response)
"""
if self.model is None or self.tokenizer is None or self.device is None:
raise RuntimeError(
"Model not loaded. Please call load_model() first."
)

# Apply chat template if available, otherwise format manually
if hasattr(self.tokenizer, "apply_chat_template"):
text = self.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
else:
# Simple fallback format
text = ""
for msg in messages:
role = msg.get("role", "user")
content = msg.get("content", "")

Copilot AI Feb 2, 2026


The fallback chat template formatting (lines 172-177) could produce ambiguous output if message content contains newline characters. Consider either sanitizing the content to remove/escape newlines, or using a more robust delimiter that won't be present in natural text.

Suggested change
role = msg.get("role", "user")
content = msg.get("content", "")
role = str(msg.get("role", "user")).replace("\n", " ")
content = str(msg.get("content", "")).replace("\n", "\\n")

text += f"{role}: {content}\n"
text += "assistant: "

inputs = self.tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"].to(self.device)

with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
max_new_tokens=max_new_tokens,
temperature=temperature,
top_p=top_p,
top_k=top_k,
do_sample=do_sample,
)

# Decode only the newly generated tokens
generated_text = self.tokenizer.decode(
output_ids[0][len(input_ids[0]) :],
skip_special_tokens=skip_special_tokens,
)

return generated_text
40 changes: 40 additions & 0 deletions tests/extra/testx_lm.py
@@ -0,0 +1,40 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0

import unittest

from pythainlp.lm import Qwen3


class LMTestCaseX(unittest.TestCase):

Copilot AI Feb 2, 2026


The test class name "LMTestCaseX" does not follow the naming convention established in tests/extra/ and documented in PR #1248. According to the 4-tier test organization, extra-tier test classes should use a descriptive module-based prefix followed by "TestCaseX" suffix. Based on the pattern in other extra test files (e.g., "GenerateTestCaseX", "AugmentTestCaseX"), this class should be named "LmTestCaseX" or more descriptively, "LanguageModelTestCaseX".

def test_qwen3_initialization(self):
# Test that Qwen3 can be instantiated
try:
model = Qwen3()
self.assertIsNotNone(model)
self.assertIsNone(model.model)
self.assertIsNone(model.tokenizer)
except ImportError:
# Skip if dependencies not installed
self.skipTest("Qwen3 dependencies not installed")

def test_qwen3_generate_without_load(self):
# Test that generate raises error when model is not loaded
try:
model = Qwen3()
with self.assertRaises(RuntimeError):
model.generate("test")
except ImportError:
# Skip if dependencies not installed
self.skipTest("Qwen3 dependencies not installed")

def test_qwen3_chat_without_load(self):
# Test that chat raises error when model is not loaded
try:
model = Qwen3()
with self.assertRaises(RuntimeError):
model.chat([{"role": "user", "content": "test"}])
except ImportError:
# Skip if dependencies not installed
self.skipTest("Qwen3 dependencies not installed")