Skip to content

too many values to unpack (expected 3) #77718

@jilieryuyi

Description

@jilieryuyi

bug描述 Describe the Bug

FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

RUN pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
RUN pip install paddleocr

代码:

#!/usr/bin/env python3
"""
PaddleOCR 修复版 - 适配最新 API
"""

import os
import sys
import json
import time
import cv2
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

# 设置环境变量
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'

# 尝试导入 PaddleOCR
try:
    from paddleocr import PaddleOCR

    print("✅ PaddleOCR 导入成功")
    print(f"PaddleOCR 版本信息: 使用最新 API")
except ImportError as e:
    print(f"❌ 导入 PaddleOCR 失败: {e}")
    sys.exit(1)


class FixedPaddleOCRAnalyzer:
    """修复后的 PaddleOCR 分析器"""

    def __init__(self, use_gpu=False, lang='ch', ocr_version='PP-OCRv4'):
        """
        初始化 PaddleOCR

        参数:
            use_gpu: 是否使用 GPU
            lang: 语言 ('ch' 或 'en')
            ocr_version: OCR 版本 ('PP-OCRv3', 'PP-OCRv4', 'PP-OCRv5')
        """
        print(f"🚀 初始化 PaddleOCR (版本: {ocr_version}, 语言: {lang})...")

        try:
            # 使用最新的 API 参数
            self.ocr = PaddleOCR(
                # 文档方向检测
                use_doc_orientation_classify=False,

                # 文档矫正
                use_doc_unwarping=False,

                # 文本检测
                text_detection_model_name=None,  # 自动选择
                #text_det_limit_side_len=960,  # 图片长边限制
                text_det_limit_type='max',  # 限制方式
                text_det_thresh=0.3,  # 检测阈值
                text_det_box_thresh=0.6,  # 文本框阈值
                text_det_unclip_ratio=1.5,  # 文本框扩展比例

                # 文本识别
                text_recognition_model_name=None,  # 自动选择
                text_rec_score_thresh=0.5,  # 识别置信度阈值
                text_rec_input_shape='3, 48, 320',  # 识别输入形状

                # 文本方向分类
                #use_textline_orientation=True,  # 启用文本方向分类
                textline_orientation_model_name=None,  # 自动选择

                # 批次大小
                textline_orientation_batch_size=1,
                #text_recognition_batch_size=1,

                # 其他参数
                lang=lang,  # 语言
                ocr_version=ocr_version,  # OCR 版本
                # show_log=False,  # 不显示详细日志
                #use_gpu=use_gpu,  # 是否使用 GPU

                # 兼容性参数(会映射到新参数)
                #det_limit_side_len=960,  # 映射到 text_det_limit_side_len
                rec_batch_num=1,  # 映射到 text_recognition_batch_size
                use_angle_cls=True,  # 映射到 use_textline_orientation
            )

            print(f"✅ PaddleOCR 初始化成功")

        except Exception as e:
            print(f"❌ 初始化失败: {e}")
            print("尝试使用简化初始化...")

            # 简化初始化
            self.ocr = PaddleOCR(
                use_angle_cls=True,
                lang=lang,
                #use_gpu=use_gpu,
                #show_log=False
            )
            print("✅ 使用简化初始化完成")

    def analyze_image(self, image_path):
        """
        分析图片

        返回:
            dict: 包含图片和分析结果
        """
        print(f"\n📄 分析图片: {os.path.basename(image_path)}")

        # 检查文件
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"图片不存在: {image_path}")

        # 读取图片
        img = cv2.imread(image_path)
        if img is None:
            raise ValueError(f"无法读取图片: {image_path}")

        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        height, width = img.shape[:2]

        print(f"📏 图片尺寸: {width} x {height}")

        # 执行 OCR
        start_time = time.time()
        try:
            result = self.ocr.predict( image_path )
        except Exception as e:
            print(f"❌ OCR 分析失败: {e}")
            # 尝试直接传递图片
            result = self.ocr.ocr(img )

        analysis_time = time.time() - start_time
        print(f"⏱️  分析耗时: {analysis_time:.2f}秒")

        # 解析结果
        text_blocks = self._parse_ocr_result(result, width, height)

        return {
            'image': img_rgb,
            'image_shape': (height, width),
            'analysis_result': result,
            'text_blocks': text_blocks,
            'analysis_time': analysis_time,
            'num_blocks': len(text_blocks)
        }

    def _parse_ocr_result(self, result, img_width, img_height):
        """
        解析 OCR 结果

        注意: PaddleOCR 的结果结构:
        [
            [  # 第一张图片
                [  # 第一个文本框
                    [[x1, y1], [x2, y2], [x3, y3], [x4, y4]],  # 文本框坐标
                    [text, confidence]  # 文本和置信度
                ],
                ...
            ]
        ]
        """
        text_blocks = []

        if not result or len(result) == 0:
            return text_blocks

        # 处理第一张图片的结果
        for line in result[0]:
            if len(line) >= 2:
                bbox = line[0]  # 文本框坐标
                text_info = line[1]  # 文本信息

                if len(text_info) >= 2:
                    text = str(text_info[0])
                    confidence = float(text_info[1])

                    # 转换为 numpy 数组方便计算
                    points = np.array(bbox, dtype=np.float32)

                    # 计算边界框
                    x_coords = points[:, 0]
                    y_coords = points[:, 1]
                    x1, y1 = np.min(x_coords), np.min(y_coords)
                    x2, y2 = np.max(x_coords), np.max(y_coords)

                    # 计算中心点
                    center_x = np.mean(x_coords)
                    center_y = np.mean(y_coords)

                    # 计算面积
                    area = (x2 - x1) * (y2 - y1)
                    area_percentage = (area / (img_width * img_height)) * 100

                    block = {
                        'bbox': bbox,
                        'bbox_rect': [float(x1), float(y1), float(x2), float(y2)],
                        'center': [float(center_x), float(center_y)],
                        'text': text,
                        'confidence': confidence,
                        'area': float(area),
                        'area_percentage': float(area_percentage),
                        'width': float(x2 - x1),
                        'height': float(y2 - y1)
                    }
                    text_blocks.append(block)

        # 按置信度排序
        text_blocks.sort(key=lambda x: x['confidence'], reverse=True)

        return text_blocks

    def visualize_results(self, result, output_path, dpi=300):
        """可视化结果"""
        print(f"\n🎨 生成可视化结果...")

        img = result['image']
        text_blocks = result['text_blocks']

        # 创建图形
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

        # 1. 原始图片
        ax1.imshow(img)
        ax1.set_title('原始图片', fontsize=16, fontweight='bold')
        ax1.axis('off')

        # 2. 分析结果
        result_img = img.copy()

        # 使用颜色映射
        colors = plt.cm.rainbow(np.linspace(0, 1, len(text_blocks)))

        for idx, block in enumerate(text_blocks):
            color = colors[idx % len(colors)]
            color_bgr = (int(color[2] * 255), int(color[1] * 255), int(color[0] * 255))

            # 绘制多边形
            points = np.array(block['bbox'], dtype=np.int32)
            cv2.polylines(result_img, [points], isClosed=True,
                          color=color_bgr, thickness=3)

            # 绘制中心点
            center_x, center_y = int(block['center'][0]), int(block['center'][1])
            cv2.circle(result_img, (center_x, center_y), 5, color_bgr, -1)

            # 添加编号和置信度
            text = f"{idx + 1}:{block['confidence']:.2f}"
            cv2.putText(result_img, text, (center_x, center_y - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color_bgr, 2)

            # 如果是高置信度文本,显示内容
            if block['confidence'] > 0.8 and len(block['text']) < 20:
                cv2.putText(result_img, block['text'][:15],
                            (center_x, center_y + 20),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, color_bgr, 1)

        ax2.imshow(result_img)
        ax2.set_title(f'OCR 结果 (检测到 {len(text_blocks)} 个文本框)',
                      fontsize=16, fontweight='bold')
        ax2.axis('off')

        # 添加统计信息
        stats_text = (f"分析统计:\n"
                      f"文本框数: {len(text_blocks)}\n"
                      f"分析时间: {result['analysis_time']:.2f}秒\n"
                      f"平均置信度: {np.mean([b['confidence'] for b in text_blocks]):.3f}")

        fig.text(0.5, 0.02, stats_text, ha='center', fontsize=12,
                 bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgray", alpha=0.8))

        plt.suptitle('PaddleOCR 文本检测与识别', fontsize=20, fontweight='bold', y=0.98)
        plt.tight_layout(rect=[0, 0.05, 1, 0.95])

        # 保存
        plt.savefig(output_path, dpi=dpi, bbox_inches='tight')
        plt.close()

        print(f"✅ 可视化结果已保存: {output_path}")
        return output_path

    def save_text_results(self, result, output_path):
        """保存文本结果"""
        text_blocks = result['text_blocks']

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("=" * 60 + "\n")
            f.write("PADDLEOCR 文本检测与识别结果\n")
            f.write("=" * 60 + "\n\n")

            f.write(f"分析时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"图片尺寸: {result['image_shape'][1]} x {result['image_shape'][0]}\n")
            f.write(f"分析耗时: {result['analysis_time']:.2f}秒\n")
            f.write(f"检测到文本框数: {len(text_blocks)}\n\n")

            # 统计置信度分布
            confidences = [b['confidence'] for b in text_blocks]
            if confidences:
                f.write(f"置信度统计:\n")
                f.write(f"  最高: {max(confidences):.4f}\n")
                f.write(f"  最低: {min(confidences):.4f}\n")
                f.write(f"  平均: {np.mean(confidences):.4f}\n")
                f.write(f"  中位数: {np.median(confidences):.4f}\n\n")

            f.write("-" * 60 + "\n")
            f.write("📋 详细结果:\n")
            f.write("-" * 60 + "\n\n")

            for idx, block in enumerate(text_blocks, 1):
                f.write(f"文本框 {idx}:\n")
                f.write(f"  文本: {block['text']}\n")
                f.write(f"  置信度: {block['confidence']:.4f}\n")
                f.write(f"  位置: {block['bbox_rect']}\n")
                f.write(f"  尺寸: {block['width']:.1f} x {block['height']:.1f}\n")
                f.write(f"  面积占比: {block['area_percentage']:.2f}%\n")
                f.write("-" * 40 + "\n")

            # 所有文本合并
            f.write("\n" + "=" * 60 + "\n")
            f.write("📄 完整文本内容:\n")
            f.write("=" * 60 + "\n\n")

            for idx, block in enumerate(text_blocks, 1):
                f.write(f"[{idx}] {block['text']}\n")

        print(f"✅ 文本结果已保存: {output_path}")
        return output_path

    def save_json_results(self, result, output_path):
        """保存 JSON 结果"""
        data = {
            'metadata': {
                'filename': os.path.basename(output_path).replace('.json', ''),
                'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
                'tool': 'PaddleOCR',
                'api_version': 'latest'
            },
            'image_info': {
                'width': result['image_shape'][1],
                'height': result['image_shape'][0],
                'channels': 3
            },
            'analysis_info': {
                'analysis_time': result['analysis_time'],
                'num_blocks': len(result['text_blocks']),
                'avg_confidence': float(np.mean([b['confidence'] for b in result['text_blocks']]))
            },
            'results': result['text_blocks']
        }

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"✅ JSON 结果已保存: {output_path}")
        return output_path

    def export_excel(self, result, output_path):
        """导出 Excel 结果"""
        try:
            import pandas as pd

            data = []
            for idx, block in enumerate(result['text_blocks'], 1):
                data.append({
                    '序号': idx,
                    '文本内容': block['text'],
                    '置信度': block['confidence'],
                    '左上角X': block['bbox_rect'][0],
                    '左上角Y': block['bbox_rect'][1],
                    '右下角X': block['bbox_rect'][2],
                    '右下角Y': block['bbox_rect'][3],
                    '宽度': block['width'],
                    '高度': block['height'],
                    '面积占比%': block['area_percentage']
                })

            df = pd.DataFrame(data)
            df.to_excel(output_path, index=False)
            print(f"✅ Excel 结果已保存: {output_path}")
            return output_path

        except ImportError:
            print("⚠️  未安装 pandas,跳过 Excel 导出")
            return None

    def process_image(self, image_path, output_dir='./output',
                      visualize=True, save_json=True, save_txt=True, save_excel=False):
        """处理单张图片"""
        # 创建输出目录
        os.makedirs(output_dir, exist_ok=True)

        # 获取文件名
        filename = os.path.splitext(os.path.basename(image_path))[0]

        # 分析图片
        try:
            result = self.analyze_image(image_path)
        except Exception as e:
            print(f"❌ 图片分析失败: {e}")
            return None

        output_files = {}

        # 可视化结果
        if visualize:
            viz_path = os.path.join(output_dir, f"{filename}_result.jpg")
            output_files['visualization'] = self.visualize_results(result, viz_path)

        # 文本结果
        if save_txt:
            txt_path = os.path.join(output_dir, f"{filename}_result.txt")
            output_files['text'] = self.save_text_results(result, txt_path)

        # JSON 结果
        if save_json:
            json_path = os.path.join(output_dir, f"{filename}_result.json")
            output_files['json'] = self.save_json_results(result, json_path)

        # Excel 结果
        if save_excel:
            excel_path = os.path.join(output_dir, f"{filename}_result.xlsx")
            excel_file = self.export_excel(result, excel_path)
            if excel_file:
                output_files['excel'] = excel_file

        # 打印摘要
        self._print_summary(result, output_files)

        return {
            'analysis_result': result,
            'output_files': output_files
        }

    def _print_summary(self, result, output_files):
        """打印处理摘要"""
        print("\n" + "=" * 60)
        print("📄 处理摘要")
        print("=" * 60)

        print(f"图片尺寸: {result['image_shape'][1]} x {result['image_shape'][0]}")
        print(f"检测到文本框数: {len(result['text_blocks'])}")
        print(f"分析耗时: {result['analysis_time']:.2f}秒")

        if result['text_blocks']:
            confidences = [b['confidence'] for b in result['text_blocks']]
            print(f"置信度统计:")
            print(f"  最高: {max(confidences):.4f}")
            print(f"  最低: {min(confidences):.4f}")
            print(f"  平均: {np.mean(confidences):.4f}")

        print("\n生成的文件:")
        for file_type, file_path in output_files.items():
            print(f"  {file_type}: {os.path.basename(file_path)}")

        print("=" * 60)


def main():
    """主函数"""
    import argparse

    parser = argparse.ArgumentParser(description='PaddleOCR 文本检测与识别工具')
    parser.add_argument('--input', '-i', type=str, required=True,
                        help='输入图片路径')
    parser.add_argument('--output', '-o', type=str, default='./output',
                        help='输出目录')
    parser.add_argument('--gpu', action='store_true',
                        help='使用 GPU 加速')
    parser.add_argument('--lang', type=str, default='ch',
                        help='语言 (ch: 中文, en: 英文)')
    parser.add_argument('--version', type=str, default='PP-OCRv4',
                        help='OCR 版本 (PP-OCRv3, PP-OCRv4, PP-OCRv5)')

    args = parser.parse_args()

    # 检查输入文件
    if not os.path.exists(args.input):
        print(f"❌ 输入文件不存在: {args.input}")
        return 1

    print("=" * 60)
    print("PaddleOCR 文本检测与识别工具")
    print("=" * 60)
    print(f"输入文件: {args.input}")
    print(f"输出目录: {args.output}")
    print(f"使用 GPU: {args.gpu}")
    print(f"语言: {args.lang}")
    print(f"OCR 版本: {args.version}")
    print("=" * 60)

    try:
        # 初始化分析器
        analyzer = FixedPaddleOCRAnalyzer(
           # use_gpu=args.gpu,
            lang=args.lang,
            ocr_version=args.version
        )

        # 处理图片
        result = analyzer.process_image(
            image_path=args.input,
            output_dir=args.output,
            visualize=True,
            save_json=True,
            save_txt=True,
            save_excel=True
        )

        if result:
            print("\n✅ 处理完成!")
            return 0
        else:
            print("\n❌ 处理失败!")
            return 1

    except Exception as e:
        print(f"\n❌ 处理过程中发生错误: {e}")
        import traceback
        traceback.print_exc()
        return 1


# 直接运行示例
if __name__ == "__main__":
    if len(sys.argv) == 1:
        # 如果没有参数,显示帮助
        print("使用示例:")
        print("  python fixed_paddleocr.py --input ./input/document.jpg --output ./output")
        print("\n或直接运行测试:")

        # 测试代码
        test_image = "./input/document.jpg"
        if not os.path.exists(test_image):
            print(f"\n⚠️  请先创建测试图片目录: mkdir input")
            print(f"并将图片放入: {test_image}")
            print("\n正在创建示例图片...")

            os.makedirs("./input", exist_ok=True)

            # 创建示例图片
            img = np.ones((600, 800, 3), dtype=np.uint8) * 255

            # 添加文本
            cv2.putText(img, "文档标题", (200, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 0), 3)
            cv2.putText(img, "这是文档的第一段内容。", (50, 200),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
            cv2.putText(img, "这是第二段内容,包含一些重要的信息。", (50, 250),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
            cv2.putText(img, "表格数据:", (50, 350),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 0), 2)
            cv2.putText(img, "项目A   |   100元", (100, 400),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
            cv2.putText(img, "项目B   |   200元", (100, 450),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
            cv2.putText(img, "文档结束", (300, 550),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)

            cv2.imwrite(test_image, img)
            print(f"✅ 已创建示例图片: {test_image}")

        # 运行分析
        analyzer = FixedPaddleOCRAnalyzer(use_gpu=False, lang='ch')
        analyzer.process_image(test_image, './output')
    else:
        sys.exit(main())

docker build -t vl .

docker run --rm -v C:/Users/xx/Desktop/PaddleOCR-VL/images:/app/images -v C:/Users/xx/Desktop/PaddleOCR-VL/models:/root/.paddlex/ --gpus all vl

输出:

Connectivity check to the model hoster has been skipped because PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK is enabled.
Creating model: ('PP-LCNet_x1_0_textline_ori', None)
Model files already exist. Using cached files. To redownload, please delete the directory manually: /root/.paddlex/official_models/PP-LCNet_x1_0_textline_ori.
Creating model: ('PP-OCRv4_mobile_det', None)
Model files already exist. Using cached files. To redownload, please delete the directory manually: /root/.paddlex/official_models/PP-OCRv4_mobile_det.
Creating model: ('PP-OCRv4_mobile_rec', None)
Model files already exist. Using cached files. To redownload, please delete the directory manually: /root/.paddlex/official_models/PP-OCRv4_mobile_rec.
✅ PaddleOCR 导入成功
PaddleOCR 版本信息: 使用最新 API

PaddleOCR 文本检测与识别工具

输入文件: /app/images/1.png
输出目录: /app/images
使用 GPU: False
语言: ch
OCR 版本: PP-OCRv4

🚀 初始化 PaddleOCR (版本: PP-OCRv4, 语言: ch)...
✅ PaddleOCR 初始化成功

📄 分析图片: 1.png
📏 图片尺寸: 1005 x 1553
❌ OCR 分析失败: too many values to unpack (expected 3)
❌ 图片分析失败: too many values to unpack (expected 3)

❌ 处理失败!

too many values to unpack (expected 3) 这个报错是啥问题,换了好几个版本都不行!

其他补充信息 Additional Supplementary Information

No response

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions