Commits (38 total, changes shown from 33)
183748c
Initial plan
Copilot Jan 19, 2026
9654117
Add Qwen3 language model support to pythainlp.lm
Copilot Jan 19, 2026
0e4b368
Address code review feedback: Fix type annotations and docstrings
Copilot Jan 19, 2026
e53c777
Fix device validation and improve docstring clarity
Copilot Jan 19, 2026
fe937d2
Fix device handling: use .to() instead of device_map parameter
Copilot Jan 19, 2026
2018813
Move Qwen3 tests from tests/core to tests/extra
Copilot Jan 19, 2026
69fbb42
Update copyright year and sort qwen3 dependency alphabetically
Copilot Jan 19, 2026
c816818
Update copyright year in testx_lm.py
bact Jan 19, 2026
0d104dd
Merge branch 'dev' of https://github.com/PyThaiNLP/pythainlp into cop…
Copilot Jan 19, 2026
772a80b
Update pythainlp/lm/qwen3.py
bact Jan 23, 2026
91b7bda
Merge branch 'dev' of https://github.com/PyThaiNLP/pythainlp into cop…
Copilot Feb 1, 2026
e2a14de
Update CITATION.cff from codemeta.json
Feb 1, 2026
b29e085
Merge branch 'dev' of https://github.com/PyThaiNLP/pythainlp into cop…
Copilot Feb 1, 2026
f55c1c6
Merge branch 'dev' of https://github.com/PyThaiNLP/pythainlp into cop…
Copilot Feb 2, 2026
c73b2a3
Apply code review feedback: Add input validation, error handling, and…
Copilot Feb 2, 2026
79a4204
Apply second round of code review feedback: Fix copyright headers, im…
Copilot Feb 2, 2026
e775b9e
Add type annotations for torch_dtype in WangChanGLM and ChatBotModel …
Copilot Feb 2, 2026
0b031fc
Merge branch 'dev' of https://github.com/PyThaiNLP/pythainlp into cop…
Copilot Feb 3, 2026
94d4314
Merge branch 'dev' of https://github.com/PyThaiNLP/pythainlp into cop…
Copilot Feb 3, 2026
1339261
Merge branch 'dev' into copilot/add-qwen3-0-6b-model
Copilot Feb 3, 2026
5b3416a
Merge branch 'dev' into copilot/add-qwen3-0-6b-model
bact Feb 4, 2026
5ecd247
Complete type annotations for Qwen3 class
Copilot Feb 4, 2026
88762b0
Replace Any type annotations with specific types from transformers li…
Copilot Feb 4, 2026
1e3f0b6
Fix type annotation reassignment in ChatBotModel
Copilot Feb 4, 2026
28869e0
Import torch in TYPE_CHECKING block
bact Feb 4, 2026
06b5d5f
Add import for torch in ChatBotModel
bact Feb 4, 2026
2d06f22
Merge branch 'dev' into copilot/add-qwen3-0-6b-model
bact Feb 4, 2026
aa8751b
Remove duplicate import statement
bact Feb 4, 2026
3d550ce
Fix ruff import sorting error in chat/core.py
Copilot Feb 4, 2026
0b52d0d
Merge branch 'dev' into copilot/add-qwen3-0-6b-model
Copilot Feb 5, 2026
b9bf7fc
Apply code review feedback: Improve code quality and documentation
Copilot Feb 5, 2026
ff07202
Apply dependency and import improvements from code review
Copilot Feb 5, 2026
df97e0c
Fix torch import scope and type annotation consistency
Copilot Feb 6, 2026
ffb73ee
Remove AutoTokenizer import from core.py
bact Feb 6, 2026
2a9f6e9
Merge branch 'dev' of https://github.com/PyThaiNLP/pythainlp into cop…
Copilot Feb 6, 2026
91303d3
Move Qwen3 tests from extra to noauto_torch suite and sync with dev b…
Copilot Feb 6, 2026
432c2ab
Improve dependency error handling and type annotations in lm module
Copilot Feb 6, 2026
f529a9b
Reorder import statements in __init__.py
bact Feb 6, 2026
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -124,6 +124,8 @@ onnx = ["numpy>=1.22", "onnxruntime>=1.10.0", "sentencepiece>=0.1.91"]

oskut = ["oskut>=1.3"]

qwen3 = ["torch>=1.9.0", "transformers>=4.22.1"]

sefr_cut = ["sefr_cut>=1.1"]

spacy_thai = ["spacy_thai>=0.7.1"]
3 changes: 1 addition & 2 deletions pythainlp/chat/core.py
@@ -10,7 +10,6 @@

from pythainlp.generate.wangchanglm import WangChanGLM


class ChatBotModel:
history: list[tuple[str, str]]
model: "WangChanGLM"
@@ -39,7 +38,7 @@ def load_model(
:param bool return_dict: return_dict
:param bool load_in_8bit: load model in 8bit
:param str device: device (cpu, cuda or other)
:param torch_dtype torch_dtype: torch_dtype
:param Optional[torch.dtype] torch_dtype: torch_dtype
:param str offload_folder: offload folder
:param bool low_cpu_mem_usage: low cpu mem usage
"""
2 changes: 1 addition & 1 deletion pythainlp/generate/wangchanglm.py
@@ -54,7 +54,7 @@ def load_model(
:param bool return_dict: return dict
:param bool load_in_8bit: load model in 8bit
:param str device: device (cpu, cuda or other)
:param torch_dtype torch_dtype: torch_dtype
:param Optional[torch.dtype] torch_dtype: torch_dtype
:param str offload_folder: offload folder
:param bool low_cpu_mem_usage: low cpu mem usage
"""
14 changes: 13 additions & 1 deletion pythainlp/lm/__init__.py
@@ -2,9 +2,21 @@
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0

__all__: list[str] = ["calculate_ngram_counts", "remove_repeated_ngrams"]
__all__ = ["calculate_ngram_counts", "remove_repeated_ngrams", "Qwen3"]

from pythainlp.lm.text_util import (
calculate_ngram_counts,
remove_repeated_ngrams,
)

try:
from pythainlp.lm.qwen3 import Qwen3
except ImportError:
# If dependencies are not installed, make Qwen3 available but raise
# error when instantiated
class Qwen3: # type: ignore
def __init__(self) -> None:
raise ImportError(
"Qwen3 requires additional dependencies. "
"Install with: pip install pythainlp[qwen3]"
)
260 changes: 260 additions & 0 deletions pythainlp/lm/qwen3.py
@@ -0,0 +1,260 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Optional

if TYPE_CHECKING:
import torch
from transformers import PreTrainedModel, PreTrainedTokenizerBase


class Qwen3:
"""Qwen3-0.6B language model for Thai text generation.

A small but capable language model from Alibaba Cloud's Qwen family,
optimized for various NLP tasks including Thai language processing.
"""

def __init__(self) -> None:
self.model: Optional["PreTrainedModel"] = None
self.tokenizer: Optional["PreTrainedTokenizerBase"] = None
self.device: Optional[str] = None
self.torch_dtype: Optional["torch.dtype"] = None
self.model_path: Optional[str] = None

def load_model(
self,
model_path: str = "Qwen/Qwen3-0.6B",
device: str = "cuda",
torch_dtype: Optional["torch.dtype"] = None,
low_cpu_mem_usage: bool = True,
) -> None:
"""Load Qwen3 model.

:param str model_path: model path or HuggingFace model ID
:param str device: device (cpu, cuda or other)
:param Optional[torch.dtype] torch_dtype: torch data type (e.g., torch.float16, torch.bfloat16)
:param bool low_cpu_mem_usage: low cpu mem usage

:Example:
::

from pythainlp.lm import Qwen3
import torch

model = Qwen3()
model.load_model(device="cpu", torch_dtype=torch.bfloat16)
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set default torch_dtype if not provided
if torch_dtype is None:
torch_dtype = torch.float16

# Check CUDA availability early before loading model
if device.startswith("cuda"):
if not torch.cuda.is_available():
raise RuntimeError(
"CUDA device requested but CUDA is not available. "
"Check your PyTorch installation and GPU drivers, or use "
"device='cpu' instead."
)

self.device = device
self.torch_dtype = torch_dtype
self.model_path = model_path

try:
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
except OSError as exc:
raise RuntimeError(
f"Failed to load tokenizer from '{self.model_path}'. "
"Check the model path or your network connection."
) from exc

try:
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path,
device_map=device,
torch_dtype=torch_dtype,
low_cpu_mem_usage=low_cpu_mem_usage,
)
except OSError as exc:
# Clean up tokenizer on failure
self.tokenizer = None
raise RuntimeError(
f"Failed to load model from '{self.model_path}'. "
"This can happen due to an invalid model path, missing files, "
"or insufficient disk space."
) from exc
except Exception as exc:
# Clean up tokenizer on failure
self.tokenizer = None
raise RuntimeError(
f"Failed to load model weights: {exc}. "
"This can be caused by insufficient memory, an incompatible "
"torch_dtype setting, or other configuration issues."
) from exc

def generate(
self,
text: str,
Copilot AI Feb 2, 2026

The generate method lacks input validation for the text parameter. If an empty string or None is passed, it may cause unclear errors downstream. Consider adding validation to check that text is a non-empty string before processing.
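A minimal sketch of the kind of guard this comment asks for, written as a hypothetical standalone helper for illustration (the name _validate_prompt is not part of the PR; the merged generate() below adds an equivalent inline check):

    def _validate_prompt(text: str) -> str:
        # Reject None, non-strings, and empty or whitespace-only prompts.
        if not isinstance(text, str) or not text.strip():
            raise ValueError("text parameter must be a non-empty string.")
        return text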
max_new_tokens: int = 512,
temperature: float = 0.7,
top_p: float = 0.9,
top_k: int = 50,
do_sample: bool = True,
skip_special_tokens: bool = True,
) -> str:
"""Generate text from a prompt.

:param str text: input text prompt
:param int max_new_tokens: maximum number of new tokens to generate
:param float temperature: temperature for sampling (higher = more random)
:param float top_p: top p for nucleus sampling
:param int top_k: top k for top-k sampling
:param bool do_sample: whether to use sampling or greedy decoding
:param bool skip_special_tokens: skip special tokens in output
:return: generated text
:rtype: str

:Example:
::

from pythainlp.lm import Qwen3
import torch

model = Qwen3()
model.load_model(device="cpu", torch_dtype=torch.bfloat16)

result = model.generate("สวัสดี")
print(result)
"""
if self.model is None or self.tokenizer is None or self.device is None:
raise RuntimeError(
"Model not loaded. Please call load_model() first."
)

if not text or not isinstance(text, str):
raise ValueError(
"text parameter must be a non-empty string."
)

import torch

inputs = self.tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"].to(self.device)

# Note: When do_sample=False (greedy decoding), temperature, top_p,
# and top_k parameters are ignored by the transformers library
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
max_new_tokens=max_new_tokens,
temperature=temperature,
top_p=top_p,
top_k=top_k,
do_sample=do_sample,
)
Comment on lines +167 to +174

Copilot AI Feb 5, 2026

When do_sample=False (greedy decoding), the temperature, top_p, and top_k parameters are ignored by the transformers library. Consider adding validation to warn users or handle this case explicitly. The current implementation may mislead users who set do_sample=False but also provide temperature values expecting them to have an effect.
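One way to surface this, sketched as a hypothetical helper that is not part of the PR; the default values compared against mirror this PR's method signature and are illustrative only:

    import warnings

    # Hypothetical guard: warn when greedy decoding will silently ignore
    # the sampling arguments the caller passed in.
    def _warn_unused_sampling_args(
        do_sample: bool, temperature: float, top_p: float, top_k: int
    ) -> None:
        if not do_sample and (temperature, top_p, top_k) != (0.7, 0.9, 50):
            warnings.warn(
                "do_sample=False uses greedy decoding; temperature, top_p, "
                "and top_k are ignored by transformers in this mode.",
                UserWarning,
            )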

# Decode only the newly generated tokens
# output_ids and input_ids are guaranteed to be 2D tensors with
# batch size 1 from the tokenizer call above
generated_text = self.tokenizer.decode(
output_ids[0][len(input_ids[0]) :],
Comment on lines 176 to 180
Copilot AI Feb 5, 2026

The slicing operation output_ids[0][len(input_ids[0]):] assumes that output_ids[0] and input_ids[0] are present. While this should generally be safe given the model.generate call, it would be more defensive to check the shapes or handle potential IndexError. Consider adding a check or comment explaining why this is safe in this context.

Suggested change

Original:

    # Decode only the newly generated tokens
    generated_text = self.tokenizer.decode(
        output_ids[0][len(input_ids[0]) :],

Suggested:

    # Decode only the newly generated tokens.
    if (
        output_ids.dim() == 2
        and input_ids.dim() == 2
        and output_ids.size(0) > 0
        and input_ids.size(0) > 0
    ):
        start_idx = input_ids.size(1)
        generated_ids = output_ids[0, start_idx:]
    else:
        raise RuntimeError(
            "Unexpected tensor shape from model.generate(); "
            "expected 2D tensors with non-empty batch dimension."
        )
    generated_text = self.tokenizer.decode(
        generated_ids,

skip_special_tokens=skip_special_tokens,
)

return generated_text

def chat(
self,
messages: list[dict[str, Any]],
Copilot AI Feb 2, 2026

The chat method lacks input validation for the messages parameter. If an empty list is passed, the method will proceed without error but may produce unexpected behavior. Consider adding a check to validate that messages is not empty and contains properly formatted message dictionaries with 'role' and 'content' keys.
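A minimal sketch of the per-message check suggested here, as a hypothetical standalone helper (the merged chat() below only validates that messages is a non-empty list):

    def _validate_messages(messages: list) -> None:
        # Hypothetical helper: ensure every entry looks like a chat message
        # before it reaches apply_chat_template().
        if not isinstance(messages, list) or not messages:
            raise ValueError("messages must be a non-empty list of dicts.")
        for i, msg in enumerate(messages):
            if not isinstance(msg, dict) or "role" not in msg or "content" not in msg:
                raise ValueError(
                    f"messages[{i}] must be a dict with 'role' and 'content' keys."
                )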
max_new_tokens: int = 512,
temperature: float = 0.7,
top_p: float = 0.9,
top_k: int = 50,
do_sample: bool = True,
skip_special_tokens: bool = True,
) -> str:
"""Generate text using chat format.

:param list[dict[str, Any]] messages: list of message dictionaries with 'role' and 'content' keys
:param int max_new_tokens: maximum number of new tokens to generate
:param float temperature: temperature for sampling
:param float top_p: top p for nucleus sampling
:param int top_k: top k for top-k sampling
:param bool do_sample: whether to use sampling
:param bool skip_special_tokens: skip special tokens in output
:return: generated response
:rtype: str

:Example:
::

from pythainlp.lm import Qwen3
import torch

model = Qwen3()
model.load_model(device="cpu", torch_dtype=torch.bfloat16)

messages = [{"role": "user", "content": "สวัสดีครับ"}]
response = model.chat(messages)
print(response)
"""
if self.model is None or self.tokenizer is None or self.device is None:
raise RuntimeError(
"Model not loaded. Please call load_model() first."
)

if not messages or not isinstance(messages, list):
raise ValueError(
"messages parameter must be a non-empty list of message dictionaries."
)

# Apply chat template if available, otherwise format manually
if hasattr(self.tokenizer, "apply_chat_template"):
text = self.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
else:
# Simple fallback format - preserve content newlines
lines = []
for msg in messages:
role = str(msg.get("role", "user")).replace("\n", " ")
content = str(msg.get("content", ""))
lines.append(f"{role}: {content}")
text = "\n".join(lines) + "\nassistant: "

import torch

inputs = self.tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"].to(self.device)

# Note: When do_sample=False (greedy decoding), temperature, top_p,
# and top_k parameters are ignored by the transformers library
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
max_new_tokens=max_new_tokens,
temperature=temperature,
top_p=top_p,
top_k=top_k,
do_sample=do_sample,
)

# Decode only the newly generated tokens
# output_ids and input_ids are guaranteed to be 2D tensors with
# batch size 1 from the tokenizer call above
generated_text = self.tokenizer.decode(
output_ids[0][len(input_ids[0]) :],
skip_special_tokens=skip_special_tokens,
)

return generated_text
24 changes: 15 additions & 9 deletions pythainlp/phayathaibert/core.py
@@ -10,8 +10,14 @@
from typing import TYPE_CHECKING, Union

if TYPE_CHECKING:
from transformers import CamembertTokenizer
from transformers.pipelines.base import Pipeline
from transformers import (
AutoModelForMaskedLM,
AutoModelForTokenClassification,
AutoTokenizer,

Check failure on line 16 in pythainlp/phayathaibert/core.py (GitHub Actions / ruff, F401):
pythainlp/phayathaibert/core.py:16:9: F401 `transformers.AutoTokenizer` imported but unused
CamembertTokenizer,
Pipeline,
PreTrainedTokenizerBase,
)

from transformers import (
CamembertTokenizer,
@@ -212,13 +218,13 @@
pipeline,
)

self.tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(
self.tokenizer: "PreTrainedTokenizerBase" = AutoTokenizer.from_pretrained(
_model_name
)
self.model_for_masked_lm: AutoModelForMaskedLM = (
self.model_for_masked_lm: "AutoModelForMaskedLM" = (
AutoModelForMaskedLM.from_pretrained(_model_name)
)
self.model: "Pipeline" = pipeline(
self.model: "Pipeline" = pipeline( # transformers.Pipeline
"fill-mask",
tokenizer=self.tokenizer,
model=self.model_for_masked_lm,
@@ -311,8 +317,8 @@
AutoTokenizer,
)

self.tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model)
self.model: AutoModelForTokenClassification = (
self.tokenizer: "PreTrainedTokenizerBase" = AutoTokenizer.from_pretrained(model)
self.model: "AutoModelForTokenClassification" = (
AutoModelForTokenClassification.from_pretrained(model)
)

@@ -356,8 +362,8 @@
AutoTokenizer,
)

self.tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model)
self.model: AutoModelForTokenClassification = (
self.tokenizer: "PreTrainedTokenizerBase" = AutoTokenizer.from_pretrained(model)
self.model: "AutoModelForTokenClassification" = (
AutoModelForTokenClassification.from_pretrained(model)
)

1 change: 1 addition & 0 deletions tests/extra/__init__.py
@@ -13,6 +13,7 @@
"tests.extra.testx_augment",
"tests.extra.testx_benchmarks",
"tests.extra.testx_cli",
"tests.extra.testx_lm",
"tests.extra.testx_spell",
"tests.extra.testx_tag",
"tests.extra.testx_tokenize",